11544Seschrock /*
21544Seschrock * CDDL HEADER START
31544Seschrock *
41544Seschrock * The contents of this file are subject to the terms of the
51544Seschrock * Common Development and Distribution License (the "License").
61544Seschrock * You may not use this file except in compliance with the License.
71544Seschrock *
81544Seschrock * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
91544Seschrock * or http://www.opensolaris.org/os/licensing.
101544Seschrock * See the License for the specific language governing permissions
111544Seschrock * and limitations under the License.
121544Seschrock *
131544Seschrock * When distributing Covered Code, include this CDDL HEADER in each
141544Seschrock * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
151544Seschrock * If applicable, add the following below this CDDL HEADER, with the
161544Seschrock * fields enclosed by brackets "[]" replaced with your own identifying
171544Seschrock * information: Portions Copyright [yyyy] [name of copyright owner]
181544Seschrock *
191544Seschrock * CDDL HEADER END
201544Seschrock */
211544Seschrock /*
229425SEric.Schrock@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
231544Seschrock * Use is subject to license terms.
241544Seschrock */
251544Seschrock
261544Seschrock #include <sys/spa.h>
271544Seschrock #include <sys/spa_impl.h>
281544Seschrock #include <sys/vdev.h>
291544Seschrock #include <sys/vdev_impl.h>
301544Seschrock #include <sys/zio.h>
3110614SJonathan.Adams@Sun.COM #include <sys/zio_checksum.h>
321544Seschrock
331544Seschrock #include <sys/fm/fs/zfs.h>
341544Seschrock #include <sys/fm/protocol.h>
351544Seschrock #include <sys/fm/util.h>
361544Seschrock #include <sys/sysevent.h>
371544Seschrock
/*
 * This general routine is responsible for generating all the different ZFS
 * ereports. The payload is dependent on the class, and which arguments are
 * supplied to the function:
 *
 *	EREPORT			POOL	VDEV	IO
 *	block			X	X	X
 *	data			X		X
 *	device			X	X
 *	pool			X
 *
 * If we are in a loading state, all errors are chained together by the same
 * SPA-wide ENA (Error Numeric Association).
 *
 * For isolated I/O requests, we get the ENA from the zio_t. The propagation
 * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
 * to chain together all ereports associated with a logical piece of data. For
 * read I/Os, there are basically three 'types' of I/O, which form a roughly
 * layered diagram:
 *
 *	+---------------+
 *	| Aggregate I/O |	No associated logical data or device
 *	+---------------+
 *		|
 *		V
 *	+---------------+	Reads associated with a piece of logical data.
 *	|   Read I/O    |	This includes reads on behalf of RAID-Z,
 *	+---------------+	mirrors, gang blocks, retries, etc.
 *		|
 *		V
 *	+---------------+	Reads associated with a particular device, but
 *	| Physical I/O  |	no logical data. Issued as part of vdev caching
 *	+---------------+	and I/O aggregation.
 *
 * Note that 'physical I/O' here is not the same terminology as used in the rest
 * of ZIO. Typically, 'physical I/O' simply means that there is no attached
 * blockpointer. But I/O with no associated block pointer can still be related
 * to a logical piece of data (i.e. RAID-Z requests).
 *
 * Purely physical I/O always have unique ENAs. They are not related to a
 * particular piece of logical data, and therefore cannot be chained together.
 * We still generate an ereport, but the DE doesn't correlate it with any
 * logical piece of data. When such an I/O fails, the delegated I/O requests
 * will issue a retry, which will trigger the 'real' ereport with the correct
 * ENA.
 *
 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
 * When a new logical I/O is issued, we set this to point to itself. Child I/Os
 * then inherit this pointer, so that when it is first set subsequent failures
 * will use the same ENA. For vdev cache fill and queue aggregation I/O,
 * this pointer is set to NULL, and no ereport will be generated (since it
 * doesn't actually correspond to any particular device or piece of data,
 * and the caller will always retry without caching or queueing anyway).
 *
 * For checksum errors, we want to include more information about the actual
 * error which occurs. Accordingly, we build an ereport when the error is
 * noticed, but instead of sending it in immediately, we hang it off of the
 * io_cksum_report field of the logical IO. When the logical IO completes
 * (successfully or not), zfs_ereport_finish_checksum() is called with the
 * good and bad versions of the buffer (if available), and we annotate the
 * ereport with information about the differences.
 */
10010614SJonathan.Adams@Sun.COM #ifdef _KERNEL
10110614SJonathan.Adams@Sun.COM static void
zfs_ereport_start(nvlist_t ** ereport_out,nvlist_t ** detector_out,const char * subclass,spa_t * spa,vdev_t * vd,zio_t * zio,uint64_t stateoroffset,uint64_t size)10210614SJonathan.Adams@Sun.COM zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
10310614SJonathan.Adams@Sun.COM const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
1041544Seschrock uint64_t stateoroffset, uint64_t size)
1051544Seschrock {
1061544Seschrock nvlist_t *ereport, *detector;
10710614SJonathan.Adams@Sun.COM
1081544Seschrock uint64_t ena;
1091544Seschrock char class[64];
1101544Seschrock
1111544Seschrock /*
11210921STim.Haley@Sun.COM * If we are doing a spa_tryimport() or in recovery mode,
11310921STim.Haley@Sun.COM * ignore errors.
1141544Seschrock */
115*11147SGeorge.Wilson@Sun.COM if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
116*11147SGeorge.Wilson@Sun.COM spa_load_state(spa) == SPA_LOAD_RECOVER)
1171544Seschrock return;
1181544Seschrock
1191544Seschrock /*
1201544Seschrock * If we are in the middle of opening a pool, and the previous attempt
1211544Seschrock * failed, don't bother logging any new ereports - we're just going to
1221544Seschrock * get the same diagnosis anyway.
1231544Seschrock */
124*11147SGeorge.Wilson@Sun.COM if (spa_load_state(spa) != SPA_LOAD_NONE &&
1251544Seschrock spa->spa_last_open_failed)
1261544Seschrock return;
1271544Seschrock
1286673Seschrock if (zio != NULL) {
1296673Seschrock /*
1306673Seschrock * If this is not a read or write zio, ignore the error. This
1316673Seschrock * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
1326673Seschrock */
1336673Seschrock if (zio->io_type != ZIO_TYPE_READ &&
1346673Seschrock zio->io_type != ZIO_TYPE_WRITE)
1356673Seschrock return;
1366673Seschrock
1376673Seschrock /*
1386673Seschrock * Ignore any errors from speculative I/Os, as failure is an
1396673Seschrock * expected result.
1406673Seschrock */
1416673Seschrock if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
1426673Seschrock return;
1436976Seschrock
1449725SEric.Schrock@Sun.COM /*
1459725SEric.Schrock@Sun.COM * If this I/O is not a retry I/O, don't post an ereport.
1469725SEric.Schrock@Sun.COM * Otherwise, we risk making bad diagnoses based on B_FAILFAST
1479725SEric.Schrock@Sun.COM * I/Os.
1489725SEric.Schrock@Sun.COM */
1499725SEric.Schrock@Sun.COM if (zio->io_error == EIO &&
1509725SEric.Schrock@Sun.COM !(zio->io_flags & ZIO_FLAG_IO_RETRY))
1519725SEric.Schrock@Sun.COM return;
1529725SEric.Schrock@Sun.COM
1539425SEric.Schrock@Sun.COM if (vd != NULL) {
1549425SEric.Schrock@Sun.COM /*
1559425SEric.Schrock@Sun.COM * If the vdev has already been marked as failing due
1569425SEric.Schrock@Sun.COM * to a failed probe, then ignore any subsequent I/O
1579425SEric.Schrock@Sun.COM * errors, as the DE will automatically fault the vdev
1589425SEric.Schrock@Sun.COM * on the first such failure. This also catches cases
1599425SEric.Schrock@Sun.COM * where vdev_remove_wanted is set and the device has
1609425SEric.Schrock@Sun.COM * not yet been asynchronously placed into the REMOVED
1619425SEric.Schrock@Sun.COM * state.
1629425SEric.Schrock@Sun.COM */
16310575SEric.Schrock@Sun.COM if (zio->io_vd == vd && !vdev_accessible(vd, zio))
1649425SEric.Schrock@Sun.COM return;
1659425SEric.Schrock@Sun.COM
1669425SEric.Schrock@Sun.COM /*
1679425SEric.Schrock@Sun.COM * Ignore checksum errors for reads from DTL regions of
1689425SEric.Schrock@Sun.COM * leaf vdevs.
1699425SEric.Schrock@Sun.COM */
1709425SEric.Schrock@Sun.COM if (zio->io_type == ZIO_TYPE_READ &&
1719425SEric.Schrock@Sun.COM zio->io_error == ECKSUM &&
1729425SEric.Schrock@Sun.COM vd->vdev_ops->vdev_op_leaf &&
1739425SEric.Schrock@Sun.COM vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
1749425SEric.Schrock@Sun.COM return;
1759425SEric.Schrock@Sun.COM }
1766673Seschrock }
1771773Seschrock
17810575SEric.Schrock@Sun.COM /*
17910575SEric.Schrock@Sun.COM * For probe failure, we want to avoid posting ereports if we've
18010575SEric.Schrock@Sun.COM * already removed the device in the meantime.
18110575SEric.Schrock@Sun.COM */
18210575SEric.Schrock@Sun.COM if (vd != NULL &&
18310575SEric.Schrock@Sun.COM strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
18410575SEric.Schrock@Sun.COM (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
18510575SEric.Schrock@Sun.COM return;
18610575SEric.Schrock@Sun.COM
1871544Seschrock if ((ereport = fm_nvlist_create(NULL)) == NULL)
1881544Seschrock return;
1891544Seschrock
1901544Seschrock if ((detector = fm_nvlist_create(NULL)) == NULL) {
1911544Seschrock fm_nvlist_destroy(ereport, FM_NVA_FREE);
1921544Seschrock return;
1931544Seschrock }
1941544Seschrock
1951544Seschrock /*
1961544Seschrock * Serialize ereport generation
1971544Seschrock */
1981544Seschrock mutex_enter(&spa->spa_errlist_lock);
1991544Seschrock
2001544Seschrock /*
2011544Seschrock * Determine the ENA to use for this event. If we are in a loading
2021544Seschrock * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
2031544Seschrock * a root zio-wide ENA. Otherwise, simply use a unique ENA.
2041544Seschrock */
205*11147SGeorge.Wilson@Sun.COM if (spa_load_state(spa) != SPA_LOAD_NONE) {
2061544Seschrock if (spa->spa_ena == 0)
2071544Seschrock spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
2081544Seschrock ena = spa->spa_ena;
2091544Seschrock } else if (zio != NULL && zio->io_logical != NULL) {
2101544Seschrock if (zio->io_logical->io_ena == 0)
2111544Seschrock zio->io_logical->io_ena =
2121544Seschrock fm_ena_generate(0, FM_ENA_FMT1);
2131544Seschrock ena = zio->io_logical->io_ena;
2141544Seschrock } else {
2151544Seschrock ena = fm_ena_generate(0, FM_ENA_FMT1);
2161544Seschrock }
2171544Seschrock
2181544Seschrock /*
2191544Seschrock * Construct the full class, detector, and other standard FMA fields.
2201544Seschrock */
2211544Seschrock (void) snprintf(class, sizeof (class), "%s.%s",
2221544Seschrock ZFS_ERROR_CLASS, subclass);
2231544Seschrock
2241544Seschrock fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
2251544Seschrock vd != NULL ? vd->vdev_guid : 0);
2261544Seschrock
2271544Seschrock fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
2281544Seschrock
2291544Seschrock /*
2301544Seschrock * Construct the per-ereport payload, depending on which parameters are
2311544Seschrock * passed in.
2321544Seschrock */
2331544Seschrock
2341544Seschrock /*
2351544Seschrock * Generic payload members common to all ereports.
2361544Seschrock */
2371544Seschrock fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
2387754SJeff.Bonwick@Sun.COM DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
2391544Seschrock DATA_TYPE_UINT64, spa_guid(spa),
2401544Seschrock FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
241*11147SGeorge.Wilson@Sun.COM spa_load_state(spa), NULL);
2421544Seschrock
2436523Sek110237 if (spa != NULL) {
2446523Sek110237 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
2456523Sek110237 DATA_TYPE_STRING,
2466523Sek110237 spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
2476523Sek110237 FM_EREPORT_FAILMODE_WAIT :
2486523Sek110237 spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
2496523Sek110237 FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
2506523Sek110237 NULL);
2516523Sek110237 }
2526523Sek110237
2531544Seschrock if (vd != NULL) {
2541544Seschrock vdev_t *pvd = vd->vdev_parent;
2551544Seschrock
2561544Seschrock fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
2571544Seschrock DATA_TYPE_UINT64, vd->vdev_guid,
2581544Seschrock FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
2591544Seschrock DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
2609425SEric.Schrock@Sun.COM if (vd->vdev_path != NULL)
2611544Seschrock fm_payload_set(ereport,
2621544Seschrock FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
2631544Seschrock DATA_TYPE_STRING, vd->vdev_path, NULL);
2649425SEric.Schrock@Sun.COM if (vd->vdev_devid != NULL)
2651544Seschrock fm_payload_set(ereport,
2661544Seschrock FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
2671544Seschrock DATA_TYPE_STRING, vd->vdev_devid, NULL);
2689425SEric.Schrock@Sun.COM if (vd->vdev_fru != NULL)
2699425SEric.Schrock@Sun.COM fm_payload_set(ereport,
2709425SEric.Schrock@Sun.COM FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
2719425SEric.Schrock@Sun.COM DATA_TYPE_STRING, vd->vdev_fru, NULL);
2721544Seschrock
2731544Seschrock if (pvd != NULL) {
2741544Seschrock fm_payload_set(ereport,
2751544Seschrock FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
2761544Seschrock DATA_TYPE_UINT64, pvd->vdev_guid,
2771544Seschrock FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
2781544Seschrock DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
2791544Seschrock NULL);
2801544Seschrock if (pvd->vdev_path)
2811544Seschrock fm_payload_set(ereport,
2821544Seschrock FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
2834831Sgw25295 DATA_TYPE_STRING, pvd->vdev_path, NULL);
2841544Seschrock if (pvd->vdev_devid)
2851544Seschrock fm_payload_set(ereport,
2861544Seschrock FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
2871544Seschrock DATA_TYPE_STRING, pvd->vdev_devid, NULL);
2881544Seschrock }
2891544Seschrock }
2901544Seschrock
2911544Seschrock if (zio != NULL) {
2921544Seschrock /*
2931544Seschrock * Payload common to all I/Os.
2941544Seschrock */
2951544Seschrock fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
2961544Seschrock DATA_TYPE_INT32, zio->io_error, NULL);
2971544Seschrock
2981544Seschrock /*
2991544Seschrock * If the 'size' parameter is non-zero, it indicates this is a
3001544Seschrock * RAID-Z or other I/O where the physical offset and length are
3011544Seschrock * provided for us, instead of within the zio_t.
3021544Seschrock */
3031544Seschrock if (vd != NULL) {
3041544Seschrock if (size)
3051544Seschrock fm_payload_set(ereport,
3061544Seschrock FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
3071544Seschrock DATA_TYPE_UINT64, stateoroffset,
3081544Seschrock FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
3091955Seschrock DATA_TYPE_UINT64, size, NULL);
3101544Seschrock else
3111544Seschrock fm_payload_set(ereport,
3121544Seschrock FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
3131544Seschrock DATA_TYPE_UINT64, zio->io_offset,
3141544Seschrock FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
3151955Seschrock DATA_TYPE_UINT64, zio->io_size, NULL);
3161544Seschrock }
3171544Seschrock
3181544Seschrock /*
3191544Seschrock * Payload for I/Os with corresponding logical information.
3201544Seschrock */
3211544Seschrock if (zio->io_logical != NULL)
3221544Seschrock fm_payload_set(ereport,
3236423Sgw25295 FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
3246423Sgw25295 DATA_TYPE_UINT64,
3256423Sgw25295 zio->io_logical->io_bookmark.zb_objset,
3261544Seschrock FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
3271544Seschrock DATA_TYPE_UINT64,
3281544Seschrock zio->io_logical->io_bookmark.zb_object,
3291544Seschrock FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
3304831Sgw25295 DATA_TYPE_INT64,
3311544Seschrock zio->io_logical->io_bookmark.zb_level,
3321544Seschrock FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
3331544Seschrock DATA_TYPE_UINT64,
3341955Seschrock zio->io_logical->io_bookmark.zb_blkid, NULL);
3351544Seschrock } else if (vd != NULL) {
3361544Seschrock /*
3371544Seschrock * If we have a vdev but no zio, this is a device fault, and the
3381544Seschrock * 'stateoroffset' parameter indicates the previous state of the
3391544Seschrock * vdev.
3401544Seschrock */
3411544Seschrock fm_payload_set(ereport,
3421544Seschrock FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
3431544Seschrock DATA_TYPE_UINT64, stateoroffset, NULL);
3441544Seschrock }
34510921STim.Haley@Sun.COM
3461544Seschrock mutex_exit(&spa->spa_errlist_lock);
3471544Seschrock
34810614SJonathan.Adams@Sun.COM *ereport_out = ereport;
34910614SJonathan.Adams@Sun.COM *detector_out = detector;
35010614SJonathan.Adams@Sun.COM }
35110614SJonathan.Adams@Sun.COM
35210614SJonathan.Adams@Sun.COM /* if it's <= 128 bytes, save the corruption directly */
35310614SJonathan.Adams@Sun.COM #define ZFM_MAX_INLINE (128 / sizeof (uint64_t))
35410614SJonathan.Adams@Sun.COM
35510614SJonathan.Adams@Sun.COM #define MAX_RANGES 16
35610614SJonathan.Adams@Sun.COM
35710614SJonathan.Adams@Sun.COM typedef struct zfs_ecksum_info {
35810614SJonathan.Adams@Sun.COM /* histograms of set and cleared bits by bit number in a 64-bit word */
35910614SJonathan.Adams@Sun.COM uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY];
36010614SJonathan.Adams@Sun.COM uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
36110614SJonathan.Adams@Sun.COM
36210614SJonathan.Adams@Sun.COM /* inline arrays of bits set and cleared. */
36310614SJonathan.Adams@Sun.COM uint64_t zei_bits_set[ZFM_MAX_INLINE];
36410614SJonathan.Adams@Sun.COM uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
36510614SJonathan.Adams@Sun.COM
36610614SJonathan.Adams@Sun.COM /*
36710614SJonathan.Adams@Sun.COM * for each range, the number of bits set and cleared. The Hamming
36810614SJonathan.Adams@Sun.COM * distance between the good and bad buffers is the sum of them all.
36910614SJonathan.Adams@Sun.COM */
37010614SJonathan.Adams@Sun.COM uint32_t zei_range_sets[MAX_RANGES];
37110614SJonathan.Adams@Sun.COM uint32_t zei_range_clears[MAX_RANGES];
37210614SJonathan.Adams@Sun.COM
37310614SJonathan.Adams@Sun.COM struct zei_ranges {
37410614SJonathan.Adams@Sun.COM uint32_t zr_start;
37510614SJonathan.Adams@Sun.COM uint32_t zr_end;
37610614SJonathan.Adams@Sun.COM } zei_ranges[MAX_RANGES];
37710614SJonathan.Adams@Sun.COM
37810614SJonathan.Adams@Sun.COM size_t zei_range_count;
37910614SJonathan.Adams@Sun.COM uint32_t zei_mingap;
38010614SJonathan.Adams@Sun.COM uint32_t zei_allowed_mingap;
38110614SJonathan.Adams@Sun.COM
38210614SJonathan.Adams@Sun.COM } zfs_ecksum_info_t;
38310614SJonathan.Adams@Sun.COM
/*
 * Account for every bit that is set in value_arg: bump the histogram slot
 * belonging to that bit, and add the number of bits found to *count.
 *
 * The word is passed through BE_64() first and slots are numbered from the
 * most significant bit (slot 0) down to the least significant (slot 63), so
 * the histogram is kept in big-endian, largest-first order.
 */
static void
update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count)
{
	uint64_t word = BE_64(value_arg);
	size_t changed = 0;
	size_t slot;

	for (slot = 0; slot < 64; slot++) {
		/* slot N corresponds to bit (63 - N) of the word */
		if ((word >> (63 - slot)) & 1ull) {
			hist[slot]++;
			changed++;
		}
	}

	/* fold this word's bits into the caller's running total */
	*count += changed;
}
40110614SJonathan.Adams@Sun.COM
40210614SJonathan.Adams@Sun.COM /*
40310614SJonathan.Adams@Sun.COM * We've now filled up the range array, and need to increase "mingap" and
40410614SJonathan.Adams@Sun.COM * shrink the range list accordingly. zei_mingap is always the smallest
40510614SJonathan.Adams@Sun.COM * distance between array entries, so we set the new_allowed_gap to be
40610614SJonathan.Adams@Sun.COM * one greater than that. We then go through the list, joining together
40710614SJonathan.Adams@Sun.COM * any ranges which are closer than the new_allowed_gap.
40810614SJonathan.Adams@Sun.COM *
40910614SJonathan.Adams@Sun.COM * By construction, there will be at least one. We also update zei_mingap
41010614SJonathan.Adams@Sun.COM * to the new smallest gap, to prepare for our next invocation.
41110614SJonathan.Adams@Sun.COM */
41210614SJonathan.Adams@Sun.COM static void
shrink_ranges(zfs_ecksum_info_t * eip)41310614SJonathan.Adams@Sun.COM shrink_ranges(zfs_ecksum_info_t *eip)
41410614SJonathan.Adams@Sun.COM {
41510614SJonathan.Adams@Sun.COM uint32_t mingap = UINT32_MAX;
41610614SJonathan.Adams@Sun.COM uint32_t new_allowed_gap = eip->zei_mingap + 1;
41710614SJonathan.Adams@Sun.COM
41810614SJonathan.Adams@Sun.COM size_t idx, output;
41910614SJonathan.Adams@Sun.COM size_t max = eip->zei_range_count;
42010614SJonathan.Adams@Sun.COM
42110614SJonathan.Adams@Sun.COM struct zei_ranges *r = eip->zei_ranges;
42210614SJonathan.Adams@Sun.COM
42310614SJonathan.Adams@Sun.COM ASSERT3U(eip->zei_range_count, >, 0);
42410614SJonathan.Adams@Sun.COM ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
42510614SJonathan.Adams@Sun.COM
42610614SJonathan.Adams@Sun.COM output = idx = 0;
42710614SJonathan.Adams@Sun.COM while (idx < max - 1) {
42810614SJonathan.Adams@Sun.COM uint32_t start = r[idx].zr_start;
42910614SJonathan.Adams@Sun.COM uint32_t end = r[idx].zr_end;
43010614SJonathan.Adams@Sun.COM
43110614SJonathan.Adams@Sun.COM while (idx < max - 1) {
43210614SJonathan.Adams@Sun.COM idx++;
43310614SJonathan.Adams@Sun.COM
43410614SJonathan.Adams@Sun.COM uint32_t nstart = r[idx].zr_start;
43510614SJonathan.Adams@Sun.COM uint32_t nend = r[idx].zr_end;
43610614SJonathan.Adams@Sun.COM
43710614SJonathan.Adams@Sun.COM uint32_t gap = nstart - end;
43810614SJonathan.Adams@Sun.COM if (gap < new_allowed_gap) {
43910614SJonathan.Adams@Sun.COM end = nend;
44010614SJonathan.Adams@Sun.COM continue;
44110614SJonathan.Adams@Sun.COM }
44210614SJonathan.Adams@Sun.COM if (gap < mingap)
44310614SJonathan.Adams@Sun.COM mingap = gap;
44410614SJonathan.Adams@Sun.COM break;
44510614SJonathan.Adams@Sun.COM }
44610614SJonathan.Adams@Sun.COM r[output].zr_start = start;
44710614SJonathan.Adams@Sun.COM r[output].zr_end = end;
44810614SJonathan.Adams@Sun.COM output++;
44910614SJonathan.Adams@Sun.COM }
45010614SJonathan.Adams@Sun.COM ASSERT3U(output, <, eip->zei_range_count);
45110614SJonathan.Adams@Sun.COM eip->zei_range_count = output;
45210614SJonathan.Adams@Sun.COM eip->zei_mingap = mingap;
45310614SJonathan.Adams@Sun.COM eip->zei_allowed_mingap = new_allowed_gap;
45410614SJonathan.Adams@Sun.COM }
45510614SJonathan.Adams@Sun.COM
45610614SJonathan.Adams@Sun.COM static void
add_range(zfs_ecksum_info_t * eip,int start,int end)45710614SJonathan.Adams@Sun.COM add_range(zfs_ecksum_info_t *eip, int start, int end)
45810614SJonathan.Adams@Sun.COM {
45910614SJonathan.Adams@Sun.COM struct zei_ranges *r = eip->zei_ranges;
46010614SJonathan.Adams@Sun.COM size_t count = eip->zei_range_count;
46110614SJonathan.Adams@Sun.COM
46210614SJonathan.Adams@Sun.COM if (count >= MAX_RANGES) {
46310614SJonathan.Adams@Sun.COM shrink_ranges(eip);
46410614SJonathan.Adams@Sun.COM count = eip->zei_range_count;
46510614SJonathan.Adams@Sun.COM }
46610614SJonathan.Adams@Sun.COM if (count == 0) {
46710614SJonathan.Adams@Sun.COM eip->zei_mingap = UINT32_MAX;
46810614SJonathan.Adams@Sun.COM eip->zei_allowed_mingap = 1;
46910614SJonathan.Adams@Sun.COM } else {
47010614SJonathan.Adams@Sun.COM int gap = start - r[count - 1].zr_end;
47110614SJonathan.Adams@Sun.COM
47210614SJonathan.Adams@Sun.COM if (gap < eip->zei_allowed_mingap) {
47310614SJonathan.Adams@Sun.COM r[count - 1].zr_end = end;
47410614SJonathan.Adams@Sun.COM return;
47510614SJonathan.Adams@Sun.COM }
47610614SJonathan.Adams@Sun.COM if (gap < eip->zei_mingap)
47710614SJonathan.Adams@Sun.COM eip->zei_mingap = gap;
47810614SJonathan.Adams@Sun.COM }
47910614SJonathan.Adams@Sun.COM r[count].zr_start = start;
48010614SJonathan.Adams@Sun.COM r[count].zr_end = end;
48110614SJonathan.Adams@Sun.COM eip->zei_range_count++;
48210614SJonathan.Adams@Sun.COM }
48310614SJonathan.Adams@Sun.COM
48410614SJonathan.Adams@Sun.COM static size_t
range_total_size(zfs_ecksum_info_t * eip)48510614SJonathan.Adams@Sun.COM range_total_size(zfs_ecksum_info_t *eip)
48610614SJonathan.Adams@Sun.COM {
48710614SJonathan.Adams@Sun.COM struct zei_ranges *r = eip->zei_ranges;
48810614SJonathan.Adams@Sun.COM size_t count = eip->zei_range_count;
48910614SJonathan.Adams@Sun.COM size_t result = 0;
49010614SJonathan.Adams@Sun.COM size_t idx;
49110614SJonathan.Adams@Sun.COM
49210614SJonathan.Adams@Sun.COM for (idx = 0; idx < count; idx++)
49310614SJonathan.Adams@Sun.COM result += (r[idx].zr_end - r[idx].zr_start);
49410614SJonathan.Adams@Sun.COM
49510614SJonathan.Adams@Sun.COM return (result);
49610614SJonathan.Adams@Sun.COM }
49710614SJonathan.Adams@Sun.COM
49810614SJonathan.Adams@Sun.COM static zfs_ecksum_info_t *
annotate_ecksum(nvlist_t * ereport,zio_bad_cksum_t * info,const uint8_t * goodbuf,const uint8_t * badbuf,size_t size,boolean_t drop_if_identical)49910614SJonathan.Adams@Sun.COM annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
50010614SJonathan.Adams@Sun.COM const uint8_t *goodbuf, const uint8_t *badbuf, size_t size,
50110614SJonathan.Adams@Sun.COM boolean_t drop_if_identical)
50210614SJonathan.Adams@Sun.COM {
50310614SJonathan.Adams@Sun.COM const uint64_t *good = (const uint64_t *)goodbuf;
50410614SJonathan.Adams@Sun.COM const uint64_t *bad = (const uint64_t *)badbuf;
50510614SJonathan.Adams@Sun.COM
50610614SJonathan.Adams@Sun.COM uint64_t allset = 0;
50710614SJonathan.Adams@Sun.COM uint64_t allcleared = 0;
50810614SJonathan.Adams@Sun.COM
50910614SJonathan.Adams@Sun.COM size_t nui64s = size / sizeof (uint64_t);
51010614SJonathan.Adams@Sun.COM
51110614SJonathan.Adams@Sun.COM size_t inline_size;
51210614SJonathan.Adams@Sun.COM int no_inline = 0;
51310614SJonathan.Adams@Sun.COM size_t idx;
51410614SJonathan.Adams@Sun.COM size_t range;
51510614SJonathan.Adams@Sun.COM
51610614SJonathan.Adams@Sun.COM size_t offset = 0;
51710614SJonathan.Adams@Sun.COM ssize_t start = -1;
51810614SJonathan.Adams@Sun.COM
51910614SJonathan.Adams@Sun.COM zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
52010614SJonathan.Adams@Sun.COM
52110614SJonathan.Adams@Sun.COM /* don't do any annotation for injected checksum errors */
52210614SJonathan.Adams@Sun.COM if (info != NULL && info->zbc_injected)
52310614SJonathan.Adams@Sun.COM return (eip);
52410614SJonathan.Adams@Sun.COM
52510614SJonathan.Adams@Sun.COM if (info != NULL && info->zbc_has_cksum) {
52610614SJonathan.Adams@Sun.COM fm_payload_set(ereport,
52710614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
52810614SJonathan.Adams@Sun.COM DATA_TYPE_UINT64_ARRAY,
52910614SJonathan.Adams@Sun.COM sizeof (info->zbc_expected) / sizeof (uint64_t),
53010614SJonathan.Adams@Sun.COM (uint64_t *)&info->zbc_expected,
53110614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
53210614SJonathan.Adams@Sun.COM DATA_TYPE_UINT64_ARRAY,
53310614SJonathan.Adams@Sun.COM sizeof (info->zbc_actual) / sizeof (uint64_t),
53410614SJonathan.Adams@Sun.COM (uint64_t *)&info->zbc_actual,
53510614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
53610614SJonathan.Adams@Sun.COM DATA_TYPE_STRING,
53710614SJonathan.Adams@Sun.COM info->zbc_checksum_name,
53810614SJonathan.Adams@Sun.COM NULL);
53910614SJonathan.Adams@Sun.COM
54010614SJonathan.Adams@Sun.COM if (info->zbc_byteswapped) {
54110614SJonathan.Adams@Sun.COM fm_payload_set(ereport,
54210614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
54310614SJonathan.Adams@Sun.COM DATA_TYPE_BOOLEAN, 1,
54410614SJonathan.Adams@Sun.COM NULL);
54510614SJonathan.Adams@Sun.COM }
54610614SJonathan.Adams@Sun.COM }
54710614SJonathan.Adams@Sun.COM
54810614SJonathan.Adams@Sun.COM if (badbuf == NULL || goodbuf == NULL)
54910614SJonathan.Adams@Sun.COM return (eip);
55010614SJonathan.Adams@Sun.COM
55110614SJonathan.Adams@Sun.COM ASSERT3U(nui64s, <=, UINT16_MAX);
55210614SJonathan.Adams@Sun.COM ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
55310614SJonathan.Adams@Sun.COM ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
55410614SJonathan.Adams@Sun.COM ASSERT3U(size, <=, UINT32_MAX);
55510614SJonathan.Adams@Sun.COM
55610614SJonathan.Adams@Sun.COM /* build up the range list by comparing the two buffers. */
55710614SJonathan.Adams@Sun.COM for (idx = 0; idx < nui64s; idx++) {
55810614SJonathan.Adams@Sun.COM if (good[idx] == bad[idx]) {
55910614SJonathan.Adams@Sun.COM if (start == -1)
56010614SJonathan.Adams@Sun.COM continue;
56110614SJonathan.Adams@Sun.COM
56210614SJonathan.Adams@Sun.COM add_range(eip, start, idx);
56310614SJonathan.Adams@Sun.COM start = -1;
56410614SJonathan.Adams@Sun.COM } else {
56510614SJonathan.Adams@Sun.COM if (start != -1)
56610614SJonathan.Adams@Sun.COM continue;
56710614SJonathan.Adams@Sun.COM
56810614SJonathan.Adams@Sun.COM start = idx;
56910614SJonathan.Adams@Sun.COM }
57010614SJonathan.Adams@Sun.COM }
57110614SJonathan.Adams@Sun.COM if (start != -1)
57210614SJonathan.Adams@Sun.COM add_range(eip, start, idx);
57310614SJonathan.Adams@Sun.COM
57410614SJonathan.Adams@Sun.COM /* See if it will fit in our inline buffers */
57510614SJonathan.Adams@Sun.COM inline_size = range_total_size(eip);
57610614SJonathan.Adams@Sun.COM if (inline_size > ZFM_MAX_INLINE)
57710614SJonathan.Adams@Sun.COM no_inline = 1;
57810614SJonathan.Adams@Sun.COM
57910614SJonathan.Adams@Sun.COM /*
58010614SJonathan.Adams@Sun.COM * If there is no change and we want to drop if the buffers are
58110614SJonathan.Adams@Sun.COM * identical, do so.
58210614SJonathan.Adams@Sun.COM */
58310614SJonathan.Adams@Sun.COM if (inline_size == 0 && drop_if_identical) {
58410614SJonathan.Adams@Sun.COM kmem_free(eip, sizeof (*eip));
58510614SJonathan.Adams@Sun.COM return (NULL);
58610614SJonathan.Adams@Sun.COM }
58710614SJonathan.Adams@Sun.COM
58810614SJonathan.Adams@Sun.COM /*
58910614SJonathan.Adams@Sun.COM * Now walk through the ranges, filling in the details of the
59010614SJonathan.Adams@Sun.COM * differences. Also convert our uint64_t-array offsets to byte
59110614SJonathan.Adams@Sun.COM * offsets.
59210614SJonathan.Adams@Sun.COM */
59310614SJonathan.Adams@Sun.COM for (range = 0; range < eip->zei_range_count; range++) {
59410614SJonathan.Adams@Sun.COM size_t start = eip->zei_ranges[range].zr_start;
59510614SJonathan.Adams@Sun.COM size_t end = eip->zei_ranges[range].zr_end;
59610614SJonathan.Adams@Sun.COM
59710614SJonathan.Adams@Sun.COM for (idx = start; idx < end; idx++) {
59810614SJonathan.Adams@Sun.COM uint64_t set, cleared;
59910614SJonathan.Adams@Sun.COM
60010614SJonathan.Adams@Sun.COM // bits set in bad, but not in good
60110614SJonathan.Adams@Sun.COM set = ((~good[idx]) & bad[idx]);
60210614SJonathan.Adams@Sun.COM // bits set in good, but not in bad
60310614SJonathan.Adams@Sun.COM cleared = (good[idx] & (~bad[idx]));
60410614SJonathan.Adams@Sun.COM
60510614SJonathan.Adams@Sun.COM allset |= set;
60610614SJonathan.Adams@Sun.COM allcleared |= cleared;
60710614SJonathan.Adams@Sun.COM
60810614SJonathan.Adams@Sun.COM if (!no_inline) {
60910614SJonathan.Adams@Sun.COM ASSERT3U(offset, <, inline_size);
61010614SJonathan.Adams@Sun.COM eip->zei_bits_set[offset] = set;
61110614SJonathan.Adams@Sun.COM eip->zei_bits_cleared[offset] = cleared;
61210614SJonathan.Adams@Sun.COM offset++;
61310614SJonathan.Adams@Sun.COM }
61410614SJonathan.Adams@Sun.COM
61510614SJonathan.Adams@Sun.COM update_histogram(set, eip->zei_histogram_set,
61610614SJonathan.Adams@Sun.COM &eip->zei_range_sets[range]);
61710614SJonathan.Adams@Sun.COM update_histogram(cleared, eip->zei_histogram_cleared,
61810614SJonathan.Adams@Sun.COM &eip->zei_range_clears[range]);
61910614SJonathan.Adams@Sun.COM }
62010614SJonathan.Adams@Sun.COM
62110614SJonathan.Adams@Sun.COM /* convert to byte offsets */
62210614SJonathan.Adams@Sun.COM eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
62310614SJonathan.Adams@Sun.COM eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
62410614SJonathan.Adams@Sun.COM }
62510614SJonathan.Adams@Sun.COM eip->zei_allowed_mingap *= sizeof (uint64_t);
62610614SJonathan.Adams@Sun.COM inline_size *= sizeof (uint64_t);
62710614SJonathan.Adams@Sun.COM
62810614SJonathan.Adams@Sun.COM /* fill in ereport */
62910614SJonathan.Adams@Sun.COM fm_payload_set(ereport,
63010614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
63110614SJonathan.Adams@Sun.COM DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
63210614SJonathan.Adams@Sun.COM (uint32_t *)eip->zei_ranges,
63310614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
63410614SJonathan.Adams@Sun.COM DATA_TYPE_UINT32, eip->zei_allowed_mingap,
63510614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
63610614SJonathan.Adams@Sun.COM DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
63710614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
63810614SJonathan.Adams@Sun.COM DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
63910614SJonathan.Adams@Sun.COM NULL);
64010614SJonathan.Adams@Sun.COM
64110614SJonathan.Adams@Sun.COM if (!no_inline) {
64210614SJonathan.Adams@Sun.COM fm_payload_set(ereport,
64310614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
64410614SJonathan.Adams@Sun.COM DATA_TYPE_UINT8_ARRAY,
64510614SJonathan.Adams@Sun.COM inline_size, (uint8_t *)eip->zei_bits_set,
64610614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
64710614SJonathan.Adams@Sun.COM DATA_TYPE_UINT8_ARRAY,
64810614SJonathan.Adams@Sun.COM inline_size, (uint8_t *)eip->zei_bits_cleared,
64910614SJonathan.Adams@Sun.COM NULL);
65010614SJonathan.Adams@Sun.COM } else {
65110614SJonathan.Adams@Sun.COM fm_payload_set(ereport,
65210614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
65310614SJonathan.Adams@Sun.COM DATA_TYPE_UINT16_ARRAY,
65410614SJonathan.Adams@Sun.COM NBBY * sizeof (uint64_t), eip->zei_histogram_set,
65510614SJonathan.Adams@Sun.COM FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
65610614SJonathan.Adams@Sun.COM DATA_TYPE_UINT16_ARRAY,
65710614SJonathan.Adams@Sun.COM NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
65810614SJonathan.Adams@Sun.COM NULL);
65910614SJonathan.Adams@Sun.COM }
66010614SJonathan.Adams@Sun.COM return (eip);
66110614SJonathan.Adams@Sun.COM }
66210614SJonathan.Adams@Sun.COM #endif
66310614SJonathan.Adams@Sun.COM
66410614SJonathan.Adams@Sun.COM void
zfs_ereport_post(const char * subclass,spa_t * spa,vdev_t * vd,zio_t * zio,uint64_t stateoroffset,uint64_t size)66510614SJonathan.Adams@Sun.COM zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
66610614SJonathan.Adams@Sun.COM uint64_t stateoroffset, uint64_t size)
66710614SJonathan.Adams@Sun.COM {
66810614SJonathan.Adams@Sun.COM #ifdef _KERNEL
66910614SJonathan.Adams@Sun.COM nvlist_t *ereport = NULL;
67010614SJonathan.Adams@Sun.COM nvlist_t *detector = NULL;
67110614SJonathan.Adams@Sun.COM
67210614SJonathan.Adams@Sun.COM zfs_ereport_start(&ereport, &detector,
67310614SJonathan.Adams@Sun.COM subclass, spa, vd, zio, stateoroffset, size);
67410614SJonathan.Adams@Sun.COM
67510614SJonathan.Adams@Sun.COM if (ereport == NULL)
67610614SJonathan.Adams@Sun.COM return;
67710614SJonathan.Adams@Sun.COM
6781544Seschrock fm_ereport_post(ereport, EVCH_SLEEP);
6791544Seschrock
6801544Seschrock fm_nvlist_destroy(ereport, FM_NVA_FREE);
6811544Seschrock fm_nvlist_destroy(detector, FM_NVA_FREE);
6821544Seschrock #endif
6831544Seschrock }
6841544Seschrock
68510614SJonathan.Adams@Sun.COM void
zfs_ereport_start_checksum(spa_t * spa,vdev_t * vd,struct zio * zio,uint64_t offset,uint64_t length,void * arg,zio_bad_cksum_t * info)68610614SJonathan.Adams@Sun.COM zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
68710614SJonathan.Adams@Sun.COM struct zio *zio, uint64_t offset, uint64_t length, void *arg,
68810614SJonathan.Adams@Sun.COM zio_bad_cksum_t *info)
68910614SJonathan.Adams@Sun.COM {
69010614SJonathan.Adams@Sun.COM zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
69110614SJonathan.Adams@Sun.COM
69210614SJonathan.Adams@Sun.COM if (zio->io_vsd != NULL)
69310614SJonathan.Adams@Sun.COM zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
69410614SJonathan.Adams@Sun.COM else
69510614SJonathan.Adams@Sun.COM zio_vsd_default_cksum_report(zio, report, arg);
69610614SJonathan.Adams@Sun.COM
69710614SJonathan.Adams@Sun.COM /* copy the checksum failure information if it was provided */
69810614SJonathan.Adams@Sun.COM if (info != NULL) {
69910614SJonathan.Adams@Sun.COM report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
70010614SJonathan.Adams@Sun.COM bcopy(info, report->zcr_ckinfo, sizeof (*info));
70110614SJonathan.Adams@Sun.COM }
70210614SJonathan.Adams@Sun.COM
70310922SJeff.Bonwick@Sun.COM report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
70410614SJonathan.Adams@Sun.COM report->zcr_length = length;
70510614SJonathan.Adams@Sun.COM
70610614SJonathan.Adams@Sun.COM #ifdef _KERNEL
70710614SJonathan.Adams@Sun.COM zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
70810614SJonathan.Adams@Sun.COM FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
70910614SJonathan.Adams@Sun.COM
71010614SJonathan.Adams@Sun.COM if (report->zcr_ereport == NULL) {
71110614SJonathan.Adams@Sun.COM report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
71210614SJonathan.Adams@Sun.COM kmem_free(report, sizeof (*report));
71310614SJonathan.Adams@Sun.COM return;
71410614SJonathan.Adams@Sun.COM }
71510614SJonathan.Adams@Sun.COM #endif
71610614SJonathan.Adams@Sun.COM
71710614SJonathan.Adams@Sun.COM mutex_enter(&spa->spa_errlist_lock);
71810614SJonathan.Adams@Sun.COM report->zcr_next = zio->io_logical->io_cksum_report;
71910614SJonathan.Adams@Sun.COM zio->io_logical->io_cksum_report = report;
72010614SJonathan.Adams@Sun.COM mutex_exit(&spa->spa_errlist_lock);
72110614SJonathan.Adams@Sun.COM }
72210614SJonathan.Adams@Sun.COM
72310614SJonathan.Adams@Sun.COM void
zfs_ereport_finish_checksum(zio_cksum_report_t * report,const void * good_data,const void * bad_data,boolean_t drop_if_identical)72410614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zio_cksum_report_t *report,
72510614SJonathan.Adams@Sun.COM const void *good_data, const void *bad_data, boolean_t drop_if_identical)
72610614SJonathan.Adams@Sun.COM {
72710614SJonathan.Adams@Sun.COM #ifdef _KERNEL
72810614SJonathan.Adams@Sun.COM zfs_ecksum_info_t *info = NULL;
72910614SJonathan.Adams@Sun.COM info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
73010614SJonathan.Adams@Sun.COM good_data, bad_data, report->zcr_length, drop_if_identical);
73110614SJonathan.Adams@Sun.COM
73210614SJonathan.Adams@Sun.COM if (info != NULL)
73310614SJonathan.Adams@Sun.COM fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
73410614SJonathan.Adams@Sun.COM
73510614SJonathan.Adams@Sun.COM fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE);
73610614SJonathan.Adams@Sun.COM fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE);
73710614SJonathan.Adams@Sun.COM report->zcr_ereport = report->zcr_detector = NULL;
73810614SJonathan.Adams@Sun.COM
73910614SJonathan.Adams@Sun.COM if (info != NULL)
74010614SJonathan.Adams@Sun.COM kmem_free(info, sizeof (*info));
74110614SJonathan.Adams@Sun.COM #endif
74210614SJonathan.Adams@Sun.COM }
74310614SJonathan.Adams@Sun.COM
74410614SJonathan.Adams@Sun.COM void
zfs_ereport_free_checksum(zio_cksum_report_t * rpt)74510614SJonathan.Adams@Sun.COM zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
74610614SJonathan.Adams@Sun.COM {
74710614SJonathan.Adams@Sun.COM #ifdef _KERNEL
74810614SJonathan.Adams@Sun.COM if (rpt->zcr_ereport != NULL) {
74910614SJonathan.Adams@Sun.COM fm_nvlist_destroy(rpt->zcr_ereport,
75010614SJonathan.Adams@Sun.COM FM_NVA_FREE);
75110614SJonathan.Adams@Sun.COM fm_nvlist_destroy(rpt->zcr_detector,
75210614SJonathan.Adams@Sun.COM FM_NVA_FREE);
75310614SJonathan.Adams@Sun.COM }
75410614SJonathan.Adams@Sun.COM #endif
75510614SJonathan.Adams@Sun.COM rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
75610614SJonathan.Adams@Sun.COM
75710614SJonathan.Adams@Sun.COM if (rpt->zcr_ckinfo != NULL)
75810614SJonathan.Adams@Sun.COM kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
75910614SJonathan.Adams@Sun.COM
76010614SJonathan.Adams@Sun.COM kmem_free(rpt, sizeof (*rpt));
76110614SJonathan.Adams@Sun.COM }
76210614SJonathan.Adams@Sun.COM
76310614SJonathan.Adams@Sun.COM void
zfs_ereport_send_interim_checksum(zio_cksum_report_t * report)76410614SJonathan.Adams@Sun.COM zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
76510614SJonathan.Adams@Sun.COM {
76610614SJonathan.Adams@Sun.COM #ifdef _KERNEL
76710614SJonathan.Adams@Sun.COM fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
76810614SJonathan.Adams@Sun.COM #endif
76910614SJonathan.Adams@Sun.COM }
77010614SJonathan.Adams@Sun.COM
77110614SJonathan.Adams@Sun.COM void
zfs_ereport_post_checksum(spa_t * spa,vdev_t * vd,struct zio * zio,uint64_t offset,uint64_t length,const void * good_data,const void * bad_data,zio_bad_cksum_t * zbc)77210614SJonathan.Adams@Sun.COM zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
77310614SJonathan.Adams@Sun.COM struct zio *zio, uint64_t offset, uint64_t length,
77410614SJonathan.Adams@Sun.COM const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc)
77510614SJonathan.Adams@Sun.COM {
77610614SJonathan.Adams@Sun.COM #ifdef _KERNEL
77710614SJonathan.Adams@Sun.COM nvlist_t *ereport = NULL;
77810614SJonathan.Adams@Sun.COM nvlist_t *detector = NULL;
77910614SJonathan.Adams@Sun.COM zfs_ecksum_info_t *info;
78010614SJonathan.Adams@Sun.COM
78110614SJonathan.Adams@Sun.COM zfs_ereport_start(&ereport, &detector,
78210614SJonathan.Adams@Sun.COM FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
78310614SJonathan.Adams@Sun.COM
78410614SJonathan.Adams@Sun.COM if (ereport == NULL)
78510614SJonathan.Adams@Sun.COM return;
78610614SJonathan.Adams@Sun.COM
78710614SJonathan.Adams@Sun.COM info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
78810614SJonathan.Adams@Sun.COM B_FALSE);
78910614SJonathan.Adams@Sun.COM
79010614SJonathan.Adams@Sun.COM if (info != NULL)
79110614SJonathan.Adams@Sun.COM fm_ereport_post(ereport, EVCH_SLEEP);
79210614SJonathan.Adams@Sun.COM
79310614SJonathan.Adams@Sun.COM fm_nvlist_destroy(ereport, FM_NVA_FREE);
79410614SJonathan.Adams@Sun.COM fm_nvlist_destroy(detector, FM_NVA_FREE);
79510614SJonathan.Adams@Sun.COM
79610614SJonathan.Adams@Sun.COM if (info != NULL)
79710614SJonathan.Adams@Sun.COM kmem_free(info, sizeof (*info));
79810614SJonathan.Adams@Sun.COM #endif
79910614SJonathan.Adams@Sun.COM }
80010614SJonathan.Adams@Sun.COM
8014451Seschrock static void
zfs_post_common(spa_t * spa,vdev_t * vd,const char * name)8024451Seschrock zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
8031544Seschrock {
8041544Seschrock #ifdef _KERNEL
8051544Seschrock nvlist_t *resource;
8061544Seschrock char class[64];
8071544Seschrock
808*11147SGeorge.Wilson@Sun.COM if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
80910575SEric.Schrock@Sun.COM return;
81010575SEric.Schrock@Sun.COM
8111544Seschrock if ((resource = fm_nvlist_create(NULL)) == NULL)
8121544Seschrock return;
8131544Seschrock
8141544Seschrock (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
8154451Seschrock ZFS_ERROR_CLASS, name);
8161544Seschrock VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
8171544Seschrock VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
8181544Seschrock VERIFY(nvlist_add_uint64(resource,
8191544Seschrock FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
8201544Seschrock if (vd)
8211544Seschrock VERIFY(nvlist_add_uint64(resource,
8221544Seschrock FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
8231544Seschrock
8241544Seschrock fm_ereport_post(resource, EVCH_SLEEP);
8251544Seschrock
8261544Seschrock fm_nvlist_destroy(resource, FM_NVA_FREE);
8271544Seschrock #endif
8281544Seschrock }
8294451Seschrock
8304451Seschrock /*
8314451Seschrock * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
8324451Seschrock * has been removed from the system. This will cause the DE to ignore any
8334451Seschrock * recent I/O errors, inferring that they are due to the asynchronous device
8344451Seschrock * removal.
8354451Seschrock */
8364451Seschrock void
zfs_post_remove(spa_t * spa,vdev_t * vd)8374451Seschrock zfs_post_remove(spa_t *spa, vdev_t *vd)
8384451Seschrock {
8394451Seschrock zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
8404451Seschrock }
8414451Seschrock
8424451Seschrock /*
8434451Seschrock * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
8444451Seschrock * has the 'autoreplace' property set, and therefore any broken vdevs will be
8454451Seschrock * handled by higher level logic, and no vdev fault should be generated.
8464451Seschrock */
8474451Seschrock void
zfs_post_autoreplace(spa_t * spa,vdev_t * vd)8484451Seschrock zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
8494451Seschrock {
8504451Seschrock zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
8514451Seschrock }
85210817SEric.Schrock@Sun.COM
85310817SEric.Schrock@Sun.COM /*
85410817SEric.Schrock@Sun.COM * The 'resource.fs.zfs.statechange' event is an internal signal that the
85510817SEric.Schrock@Sun.COM * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
85610817SEric.Schrock@Sun.COM * cause the retire agent to repair any outstanding fault management cases
85710817SEric.Schrock@Sun.COM * open because the device was not found (fault.fs.zfs.device).
85810817SEric.Schrock@Sun.COM */
85910817SEric.Schrock@Sun.COM void
zfs_post_state_change(spa_t * spa,vdev_t * vd)86010817SEric.Schrock@Sun.COM zfs_post_state_change(spa_t *spa, vdev_t *vd)
86110817SEric.Schrock@Sun.COM {
86210817SEric.Schrock@Sun.COM zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
86310817SEric.Schrock@Sun.COM }
864