xref: /freebsd-src/sys/contrib/openzfs/module/zfs/vdev_raidz.c (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy  * CDDL HEADER START
3eda14cbcSMatt Macy  *
4eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy  *
8eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9271171e0SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
10eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11eda14cbcSMatt Macy  * and limitations under the License.
12eda14cbcSMatt Macy  *
13eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy  *
19eda14cbcSMatt Macy  * CDDL HEADER END
20eda14cbcSMatt Macy  */
21eda14cbcSMatt Macy 
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
242c48331dSMatt Macy  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25eda14cbcSMatt Macy  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
26eda14cbcSMatt Macy  */
27eda14cbcSMatt Macy 
28eda14cbcSMatt Macy #include <sys/zfs_context.h>
29eda14cbcSMatt Macy #include <sys/spa.h>
30e716630dSMartin Matuska #include <sys/spa_impl.h>
31e716630dSMartin Matuska #include <sys/zap.h>
32eda14cbcSMatt Macy #include <sys/vdev_impl.h>
33e716630dSMartin Matuska #include <sys/metaslab_impl.h>
34eda14cbcSMatt Macy #include <sys/zio.h>
35eda14cbcSMatt Macy #include <sys/zio_checksum.h>
36e716630dSMartin Matuska #include <sys/dmu_tx.h>
37eda14cbcSMatt Macy #include <sys/abd.h>
38e716630dSMartin Matuska #include <sys/zfs_rlock.h>
39eda14cbcSMatt Macy #include <sys/fs/zfs.h>
40eda14cbcSMatt Macy #include <sys/fm/fs/zfs.h>
41eda14cbcSMatt Macy #include <sys/vdev_raidz.h>
42eda14cbcSMatt Macy #include <sys/vdev_raidz_impl.h>
437877fdebSMatt Macy #include <sys/vdev_draid.h>
44e716630dSMartin Matuska #include <sys/uberblock_impl.h>
45e716630dSMartin Matuska #include <sys/dsl_scan.h>
46eda14cbcSMatt Macy 
47eda14cbcSMatt Macy #ifdef ZFS_DEBUG
48eda14cbcSMatt Macy #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
49eda14cbcSMatt Macy #endif
50eda14cbcSMatt Macy 
51eda14cbcSMatt Macy /*
52eda14cbcSMatt Macy  * Virtual device vector for RAID-Z.
53eda14cbcSMatt Macy  *
54eda14cbcSMatt Macy  * This vdev supports single, double, and triple parity. For single parity,
55eda14cbcSMatt Macy  * we use a simple XOR of all the data columns. For double or triple parity,
56eda14cbcSMatt Macy  * we use a special case of Reed-Solomon coding. This extends the
57eda14cbcSMatt Macy  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
58eda14cbcSMatt Macy  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
59eda14cbcSMatt Macy  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
60eda14cbcSMatt Macy  * former is also based. The latter is designed to provide higher performance
61eda14cbcSMatt Macy  * for writes.
62eda14cbcSMatt Macy  *
63eda14cbcSMatt Macy  * Note that the Plank paper claimed to support arbitrary N+M, but was then
64eda14cbcSMatt Macy  * amended six years later identifying a critical flaw that invalidates its
65eda14cbcSMatt Macy  * claims. Nevertheless, the technique can be adapted to work for up to
66eda14cbcSMatt Macy  * triple parity. For additional parity, the amendment "Note: Correction to
67eda14cbcSMatt Macy  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
68eda14cbcSMatt Macy  * is viable, but the additional complexity means that write performance will
69eda14cbcSMatt Macy  * suffer.
70eda14cbcSMatt Macy  *
 * All of the methods above operate on a Galois field with 2^N elements,
 * GF(2^N). In our case we choose N=8, i.e. GF(2^8), so that all elements
 * can be expressed with a single byte. Briefly, the operations on the
 * field are defined as follows:
75eda14cbcSMatt Macy  *
76eda14cbcSMatt Macy  *   o addition (+) is represented by a bitwise XOR
77eda14cbcSMatt Macy  *   o subtraction (-) is therefore identical to addition: A + B = A - B
78eda14cbcSMatt Macy  *   o multiplication of A by 2 is defined by the following bitwise expression:
79eda14cbcSMatt Macy  *
80eda14cbcSMatt Macy  *	(A * 2)_7 = A_6
81eda14cbcSMatt Macy  *	(A * 2)_6 = A_5
82eda14cbcSMatt Macy  *	(A * 2)_5 = A_4
83eda14cbcSMatt Macy  *	(A * 2)_4 = A_3 + A_7
84eda14cbcSMatt Macy  *	(A * 2)_3 = A_2 + A_7
85eda14cbcSMatt Macy  *	(A * 2)_2 = A_1 + A_7
86eda14cbcSMatt Macy  *	(A * 2)_1 = A_0
87eda14cbcSMatt Macy  *	(A * 2)_0 = A_7
88eda14cbcSMatt Macy  *
89eda14cbcSMatt Macy  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
90eda14cbcSMatt Macy  * As an aside, this multiplication is derived from the error correcting
91eda14cbcSMatt Macy  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
92eda14cbcSMatt Macy  *
93eda14cbcSMatt Macy  * Observe that any number in the field (except for 0) can be expressed as a
94eda14cbcSMatt Macy  * power of 2 -- a generator for the field. We store a table of the powers of
95eda14cbcSMatt Macy  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
96eda14cbcSMatt Macy  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
97eda14cbcSMatt Macy  * than field addition). The inverse of a field element A (A^-1) is therefore
98eda14cbcSMatt Macy  * A ^ (255 - 1) = A^254.
99eda14cbcSMatt Macy  *
100eda14cbcSMatt Macy  * The up-to-three parity columns, P, Q, R over several data columns,
101eda14cbcSMatt Macy  * D_0, ... D_n-1, can be expressed by field operations:
102eda14cbcSMatt Macy  *
103eda14cbcSMatt Macy  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
104eda14cbcSMatt Macy  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
105eda14cbcSMatt Macy  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
106eda14cbcSMatt Macy  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
107eda14cbcSMatt Macy  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
108eda14cbcSMatt Macy  *
109eda14cbcSMatt Macy  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
110eda14cbcSMatt Macy  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
111eda14cbcSMatt Macy  * independent coefficients. (There are no additional coefficients that have
112eda14cbcSMatt Macy  * this property which is why the uncorrected Plank method breaks down.)
113eda14cbcSMatt Macy  *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
116eda14cbcSMatt Macy  */
117eda14cbcSMatt Macy 
/* Numeric identifiers for the three parity columns P, Q and R. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply one field element by 2: shift left by one and, if bit 7 of the
 * input was set, XOR in 0x1d.  MUL_4 is simply two doublings.
 *
 * The result is NOT truncated to 8 bits (after integer promotion, bit 8 and
 * above may be set); narrow it to a byte when storing it.  The argument is
 * evaluated more than once, so it must be free of side effects.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * x is updated in place and mask is clobbered with scratch state (0xff in
 * every byte lane whose top bit was set in the incoming x, 0x00 elsewhere).
 * Both must be modifiable 64-bit lvalues without side effects, since each is
 * evaluated several times.
 *
 * The bodies are wrapped in do { } while (0) so that an invocation followed
 * by a semicolon is exactly one statement and can be used as the unbraced
 * body of an if/else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

/* Multiply all eight byte lanes of x by 4 (two successive doublings). */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
} while (0)
144eda14cbcSMatt Macy 
145e716630dSMartin Matuska 
146e716630dSMartin Matuska /*
147e716630dSMartin Matuska  * Big Theory Statement for how a RAIDZ VDEV is expanded
148e716630dSMartin Matuska  *
149e716630dSMartin Matuska  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
150e716630dSMartin Matuska  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
151e716630dSMartin Matuska  * that have been previously expanded can be expanded again.
152e716630dSMartin Matuska  *
 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 * the VDEV) when an expansion starts.  The expansion will pause if any disk
 * in the VDEV fails, and resume once the VDEV is healthy again.  All other
 * operations on the pool can continue while an expansion is in progress (e.g.
 * read/write, snapshot, zpool add, etc), except for zpool checkpoint, zpool
 * trim, and zpool initialize, which can't be run during an expansion.
 * Following a reboot or export/import, the expansion resumes where it left
 * off.
160e716630dSMartin Matuska  *
161e716630dSMartin Matuska  * == Reflowing the Data ==
162e716630dSMartin Matuska  *
163e716630dSMartin Matuska  * The expansion involves reflowing (copying) the data from the current set
164e716630dSMartin Matuska  * of disks to spread it across the new set which now has one more disk. This
165e716630dSMartin Matuska  * reflow operation is similar to reflowing text when the column width of a
166e716630dSMartin Matuska  * text editor window is expanded. The text doesn’t change but the location of
167e716630dSMartin Matuska  * the text changes to accommodate the new width. An example reflow result for
168e716630dSMartin Matuska  * a 4-wide RAIDZ1 to a 5-wide is shown below.
169e716630dSMartin Matuska  *
170e716630dSMartin Matuska  *                            Reflow End State
171e716630dSMartin Matuska  *            Each letter indicates a parity group (logical stripe)
172e716630dSMartin Matuska  *
173e716630dSMartin Matuska  *         Before expansion                         After Expansion
174e716630dSMartin Matuska  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
175e716630dSMartin Matuska  *  +------+------+------+------+         +------+------+------+------+------+
176e716630dSMartin Matuska  *  |      |      |      |      |         |      |      |      |      |      |
177e716630dSMartin Matuska  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
178e716630dSMartin Matuska  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
179e716630dSMartin Matuska  *  +------+------+------+------+         +------+------+------+------+------+
180e716630dSMartin Matuska  *  |      |      |      |      |         |      |      |      |      |      |
181e716630dSMartin Matuska  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
182e716630dSMartin Matuska  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
183e716630dSMartin Matuska  *  +------+------+------+------+         +------+------+------+------+------+
184e716630dSMartin Matuska  *  |      |      |      |      |         |      |      |      |      |      |
185e716630dSMartin Matuska  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
186e716630dSMartin Matuska  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
187e716630dSMartin Matuska  *  +------+------+------+------+         +------+------+------+------+------+
188e716630dSMartin Matuska  *  |      |      |      |      |         |      |      |      |      |      |
189e716630dSMartin Matuska  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
190e716630dSMartin Matuska  *  |    13|    14|    15|    16|         |    16|    17|    18|p   19|    20|
191e716630dSMartin Matuska  *  +------+------+------+------+         +------+------+------+------+------+
192e716630dSMartin Matuska  *  |      |      |      |      |         |      |      |      |      |      |
193e716630dSMartin Matuska  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
194e716630dSMartin Matuska  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
195e716630dSMartin Matuska  *  +------+------+------+------+         +------+------+------+------+------+
196e716630dSMartin Matuska  *  |      |      |      |      |         |      |      |      |      |      |
197e716630dSMartin Matuska  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
198e716630dSMartin Matuska  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
199e716630dSMartin Matuska  *  +------+------+------+------+         +------+------+------+------+------+
200e716630dSMartin Matuska  *  |      |      |      |      |         |      |      |      |      |      |
201e716630dSMartin Matuska  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
202e716630dSMartin Matuska  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
203e716630dSMartin Matuska  *  +------+------+------+------+         +------+------+------+------+------+
204e716630dSMartin Matuska  *
205e716630dSMartin Matuska  * This reflow approach has several advantages. There is no need to read or
206e716630dSMartin Matuska  * modify the block pointers or recompute any block checksums.  The reflow
207e716630dSMartin Matuska  * doesn’t need to know where the parity sectors reside. We can read and write
208e716630dSMartin Matuska  * data sequentially and the copy can occur in a background thread in open
209e716630dSMartin Matuska  * context. The design also allows for fast discovery of what data to copy.
210e716630dSMartin Matuska  *
211e716630dSMartin Matuska  * The VDEV metaslabs are processed, one at a time, to copy the block data to
212e716630dSMartin Matuska  * have it flow across all the disks. The metaslab is disabled for allocations
213e716630dSMartin Matuska  * during the copy. As an optimization, we only copy the allocated data which
214e716630dSMartin Matuska  * can be determined by looking at the metaslab range tree. During the copy we
215e716630dSMartin Matuska  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
216e716630dSMartin Matuska  * need to be able to survive losing parity count disks).  This means we
217e716630dSMartin Matuska  * cannot overwrite data during the reflow that would be needed if a disk is
218e716630dSMartin Matuska  * lost.
219e716630dSMartin Matuska  *
220e716630dSMartin Matuska  * After the reflow completes, all newly-written blocks will have the new
221e716630dSMartin Matuska  * layout, i.e., they will have the parity to data ratio implied by the new
222e716630dSMartin Matuska  * number of disks in the RAIDZ group.  Even though the reflow copies all of
223e716630dSMartin Matuska  * the allocated space (data and parity), it is only rearranged, not changed.
224e716630dSMartin Matuska  *
225e716630dSMartin Matuska  * This act of reflowing the data has a few implications about blocks
226e716630dSMartin Matuska  * that were written before the reflow completes:
227e716630dSMartin Matuska  *
228e716630dSMartin Matuska  *  - Old blocks will still use the same amount of space (i.e., they will have
229e716630dSMartin Matuska  *    the parity to data ratio implied by the old number of disks in the RAIDZ
230e716630dSMartin Matuska  *    group).
231e716630dSMartin Matuska  *  - Reading old blocks will be slightly slower than before the reflow, for
232e716630dSMartin Matuska  *    two reasons. First, we will have to read from all disks in the RAIDZ
233e716630dSMartin Matuska  *    VDEV, rather than being able to skip the children that contain only
234e716630dSMartin Matuska  *    parity of this block (because the data of a single block is now spread
235e716630dSMartin Matuska  *    out across all the disks).  Second, in most cases there will be an extra
236e716630dSMartin Matuska  *    bcopy, needed to rearrange the data back to its original layout in memory.
237e716630dSMartin Matuska  *
238e716630dSMartin Matuska  * == Scratch Area ==
239e716630dSMartin Matuska  *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk.  Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
246e716630dSMartin Matuska  *
247e716630dSMartin Matuska  * To get around this, a scratch space is used so that we can start copying
248e716630dSMartin Matuska  * without risking data loss by overlapping the row. As an added benefit, it
249e716630dSMartin Matuska  * improves performance at the beginning of the reflow, but that small perf
250e716630dSMartin Matuska  * boost wouldn't be worth the complexity on its own.
251e716630dSMartin Matuska  *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
 * size using only a few MB of scratch per disk.
257e716630dSMartin Matuska  *
258e716630dSMartin Matuska  * The scratch area is persisted to disk which holds a large amount of reflowed
259e716630dSMartin Matuska  * state. We can always read the partially written stripes when a disk fails or
260e716630dSMartin Matuska  * the copy is interrupted (crash) during the initial copying phase and also
261e716630dSMartin Matuska  * get past a small chunk size restriction.  At a minimum, the scratch space
262e716630dSMartin Matuska  * must be large enough to get us to the point that one row does not overlap
263e716630dSMartin Matuska  * itself when moved (i.e new_width^2).  But going larger is even better. We
264e716630dSMartin Matuska  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
265e716630dSMartin Matuska  * as our scratch space to handle overwriting the initial part of the VDEV.
266e716630dSMartin Matuska  *
267e716630dSMartin Matuska  *	0     256K   512K                    4M
268e716630dSMartin Matuska  *	+------+------+-----------------------+-----------------------------
269e716630dSMartin Matuska  *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
270e716630dSMartin Matuska  *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
271e716630dSMartin Matuska  *	+------+------+-----------------------+-------------------------------
272e716630dSMartin Matuska  *                        Scratch Area
273e716630dSMartin Matuska  *
274e716630dSMartin Matuska  * == Reflow Progress Updates ==
275e716630dSMartin Matuska  * After the initial scratch-based reflow, the expansion process works
276e716630dSMartin Matuska  * similarly to device removal. We create a new open context thread which
277e716630dSMartin Matuska  * reflows the data, and periodically kicks off sync tasks to update logical
278e716630dSMartin Matuska  * state. In this case, state is the committed progress (offset of next data
279e716630dSMartin Matuska  * to copy). We need to persist the completed offset on disk, so that if we
280e716630dSMartin Matuska  * crash we know which format each VDEV offset is in.
281e716630dSMartin Matuska  *
282e716630dSMartin Matuska  * == Time Dependent Geometry ==
283e716630dSMartin Matuska  *
284e716630dSMartin Matuska  * In non-expanded RAIDZ, blocks are read from disk in a column by column
285e716630dSMartin Matuska  * fashion. For a multi-row block, the second sector is in the first column
286e716630dSMartin Matuska  * not in the second column. This allows us to issue full reads for each
287e716630dSMartin Matuska  * column directly into the request buffer. The block data is thus laid out
288e716630dSMartin Matuska  * sequentially in a column-by-column fashion.
289e716630dSMartin Matuska  *
290e716630dSMartin Matuska  * For example, in the before expansion diagram above, one logical block might
291e716630dSMartin Matuska  * be sectors G19-H26. The parity is in G19,H23; and the data is in
292e716630dSMartin Matuska  * G20,H24,G21,H25,G22,H26.
293e716630dSMartin Matuska  *
294e716630dSMartin Matuska  * After a block is reflowed, the sectors that were all in the original column
295e716630dSMartin Matuska  * data can now reside in different columns. When reading from an expanded
296e716630dSMartin Matuska  * VDEV, we need to know the logical stripe width for each block so we can
297e716630dSMartin Matuska  * reconstitute the block’s data after the reads are completed. Likewise,
298e716630dSMartin Matuska  * when we perform the combinatorial reconstruction we need to know the
299e716630dSMartin Matuska  * original width so we can retry combinations from the past layouts.
300e716630dSMartin Matuska  *
301e716630dSMartin Matuska  * Time dependent geometry is what we call having blocks with different layouts
302e716630dSMartin Matuska  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
303e716630dSMartin Matuska  * block’s birth time (+ the time expansion ended) to establish the correct
304e716630dSMartin Matuska  * width for a given block. After an expansion completes, we record the time
305e716630dSMartin Matuska  * for blocks written with a particular width (geometry).
306e716630dSMartin Matuska  *
307e716630dSMartin Matuska  * == On Disk Format Changes ==
308e716630dSMartin Matuska  *
309e716630dSMartin Matuska  * New pool feature flag, 'raidz_expansion' whose reference count is the number
310e716630dSMartin Matuska  * of RAIDZ VDEVs that have been expanded.
311e716630dSMartin Matuska  *
312e716630dSMartin Matuska  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
313e716630dSMartin Matuska  *
 * The uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ, and might or might not have been expanded, so we need to
 * know which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed and we persist that in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
 * After the expansion is complete, we then use the raidz_expand_txgs array
 * (see below) to determine how to read a block and the ub_raidz_reflow_info
 * field is no longer required.
322e716630dSMartin Matuska  *
323e716630dSMartin Matuska  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
324e716630dSMartin Matuska  * state (i.e., active or not) which is also required before reading a block
325e716630dSMartin Matuska  * during the initial phase of reflowing the data.
326e716630dSMartin Matuska  *
327e716630dSMartin Matuska  * The top-level RAIDZ VDEV has two new entries in the nvlist:
328e716630dSMartin Matuska  *
329e716630dSMartin Matuska  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
330e716630dSMartin Matuska  *                            and used after the expansion is complete to
331e716630dSMartin Matuska  *                            determine how to read a raidz block
332e716630dSMartin Matuska  * 'raidz_expanding' boolean: present during reflow and removed after completion
333e716630dSMartin Matuska  *                            used during a spa import to resume an unfinished
334e716630dSMartin Matuska  *                            expansion
335e716630dSMartin Matuska  *
336e716630dSMartin Matuska  * And finally the VDEVs top zap adds the following informational entries:
337e716630dSMartin Matuska  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
338e716630dSMartin Matuska  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
339e716630dSMartin Matuska  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
340e716630dSMartin Matuska  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
341e716630dSMartin Matuska  */
342e716630dSMartin Matuska 
343e716630dSMartin Matuska /*
344e716630dSMartin Matuska  * For testing only: pause the raidz expansion after reflowing this amount.
345e716630dSMartin Matuska  * (accessed by ZTS and ztest)
346e716630dSMartin Matuska  */
347e716630dSMartin Matuska #ifdef	_KERNEL
348e716630dSMartin Matuska static
349e716630dSMartin Matuska #endif	/* _KERNEL */
350e716630dSMartin Matuska unsigned long raidz_expand_max_reflow_bytes = 0;
351e716630dSMartin Matuska 
352e716630dSMartin Matuska /*
353e716630dSMartin Matuska  * For testing only: pause the raidz expansion at a certain point.
354e716630dSMartin Matuska  */
355e716630dSMartin Matuska uint_t raidz_expand_pause_point = 0;
356e716630dSMartin Matuska 
357e716630dSMartin Matuska /*
358e716630dSMartin Matuska  * Maximum amount of copy io's outstanding at once.
359e716630dSMartin Matuska  */
36017aab35aSMartin Matuska #ifdef _ILP32
36117aab35aSMartin Matuska static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
36217aab35aSMartin Matuska #else
363e716630dSMartin Matuska static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
36417aab35aSMartin Matuska #endif
365e716630dSMartin Matuska 
366e716630dSMartin Matuska /*
367e716630dSMartin Matuska  * Apply raidz map abds aggregation if the number of rows in the map is equal
368e716630dSMartin Matuska  * or greater than the value below.
369e716630dSMartin Matuska  */
370e716630dSMartin Matuska static unsigned long raidz_io_aggregate_rows = 4;
371e716630dSMartin Matuska 
372e716630dSMartin Matuska /*
373e716630dSMartin Matuska  * Automatically start a pool scrub when a RAIDZ expansion completes in
374e716630dSMartin Matuska  * order to verify the checksums of all blocks which have been copied
375e716630dSMartin Matuska  * during the expansion.  Automatic scrubbing is enabled by default and
376e716630dSMartin Matuska  * is strongly recommended.
377e716630dSMartin Matuska  */
378e716630dSMartin Matuska static int zfs_scrub_after_expand = 1;
379e716630dSMartin Matuska 
3807877fdebSMatt Macy static void
3817877fdebSMatt Macy vdev_raidz_row_free(raidz_row_t *rr)
382eda14cbcSMatt Macy {
383184c1b94SMartin Matuska 	for (int c = 0; c < rr->rr_cols; c++) {
384184c1b94SMartin Matuska 		raidz_col_t *rc = &rr->rr_col[c];
385eda14cbcSMatt Macy 
386184c1b94SMartin Matuska 		if (rc->rc_size != 0)
387184c1b94SMartin Matuska 			abd_free(rc->rc_abd);
388184c1b94SMartin Matuska 		if (rc->rc_orig_data != NULL)
389f9693befSMartin Matuska 			abd_free(rc->rc_orig_data);
390eda14cbcSMatt Macy 	}
391eda14cbcSMatt Macy 
3927877fdebSMatt Macy 	if (rr->rr_abd_empty != NULL)
3937877fdebSMatt Macy 		abd_free(rr->rr_abd_empty);
394eda14cbcSMatt Macy 
3957877fdebSMatt Macy 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
3967877fdebSMatt Macy }
3977877fdebSMatt Macy 
3987877fdebSMatt Macy void
3997877fdebSMatt Macy vdev_raidz_map_free(raidz_map_t *rm)
4007877fdebSMatt Macy {
4017877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++)
4027877fdebSMatt Macy 		vdev_raidz_row_free(rm->rm_row[i]);
4037877fdebSMatt Macy 
404e716630dSMartin Matuska 	if (rm->rm_nphys_cols) {
405e716630dSMartin Matuska 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
406e716630dSMartin Matuska 			if (rm->rm_phys_col[i].rc_abd != NULL)
407e716630dSMartin Matuska 				abd_free(rm->rm_phys_col[i].rc_abd);
408e716630dSMartin Matuska 		}
409e716630dSMartin Matuska 
410e716630dSMartin Matuska 		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
411e716630dSMartin Matuska 		    rm->rm_nphys_cols);
412e716630dSMartin Matuska 	}
413e716630dSMartin Matuska 
414e716630dSMartin Matuska 	ASSERT3P(rm->rm_lr, ==, NULL);
4157877fdebSMatt Macy 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
416eda14cbcSMatt Macy }
417eda14cbcSMatt Macy 
418eda14cbcSMatt Macy static void
419eda14cbcSMatt Macy vdev_raidz_map_free_vsd(zio_t *zio)
420eda14cbcSMatt Macy {
421eda14cbcSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
422eda14cbcSMatt Macy 
423eda14cbcSMatt Macy 	vdev_raidz_map_free(rm);
424eda14cbcSMatt Macy }
425eda14cbcSMatt Macy 
426e716630dSMartin Matuska static int
427e716630dSMartin Matuska vdev_raidz_reflow_compare(const void *x1, const void *x2)
428e716630dSMartin Matuska {
429e716630dSMartin Matuska 	const reflow_node_t *l = x1;
430e716630dSMartin Matuska 	const reflow_node_t *r = x2;
431e716630dSMartin Matuska 
432e716630dSMartin Matuska 	return (TREE_CMP(l->re_txg, r->re_txg));
433e716630dSMartin Matuska }
434e716630dSMartin Matuska 
435f9693befSMartin Matuska const zio_vsd_ops_t vdev_raidz_vsd_ops = {
436eda14cbcSMatt Macy 	.vsd_free = vdev_raidz_map_free_vsd,
437eda14cbcSMatt Macy };
438eda14cbcSMatt Macy 
439e716630dSMartin Matuska raidz_row_t *
44087bf66d4SMartin Matuska vdev_raidz_row_alloc(int cols, zio_t *zio)
441e716630dSMartin Matuska {
442e716630dSMartin Matuska 	raidz_row_t *rr =
443e716630dSMartin Matuska 	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
444e716630dSMartin Matuska 
445e716630dSMartin Matuska 	rr->rr_cols = cols;
446e716630dSMartin Matuska 	rr->rr_scols = cols;
447e716630dSMartin Matuska 
448e716630dSMartin Matuska 	for (int c = 0; c < cols; c++) {
449e716630dSMartin Matuska 		raidz_col_t *rc = &rr->rr_col[c];
450e716630dSMartin Matuska 		rc->rc_shadow_devidx = INT_MAX;
451e716630dSMartin Matuska 		rc->rc_shadow_offset = UINT64_MAX;
45287bf66d4SMartin Matuska 		/*
45387bf66d4SMartin Matuska 		 * We can not allow self healing to take place for Direct I/O
45487bf66d4SMartin Matuska 		 * reads. There is nothing that stops the buffer contents from
45587bf66d4SMartin Matuska 		 * being manipulated while the I/O is in flight. It is possible
45687bf66d4SMartin Matuska 		 * that the checksum could be verified on the buffer and then
45787bf66d4SMartin Matuska 		 * the contents of that buffer are manipulated afterwards. This
45887bf66d4SMartin Matuska 		 * could lead to bad data being written out during self
45987bf66d4SMartin Matuska 		 * healing.
46087bf66d4SMartin Matuska 		 */
46187bf66d4SMartin Matuska 		if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
462e716630dSMartin Matuska 			rc->rc_allow_repair = 1;
463e716630dSMartin Matuska 	}
464e716630dSMartin Matuska 	return (rr);
465e716630dSMartin Matuska }
466e716630dSMartin Matuska 
46781b22a98SMartin Matuska static void
46881b22a98SMartin Matuska vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
46981b22a98SMartin Matuska {
47081b22a98SMartin Matuska 	int c;
47181b22a98SMartin Matuska 	int nwrapped = 0;
47281b22a98SMartin Matuska 	uint64_t off = 0;
47381b22a98SMartin Matuska 	raidz_row_t *rr = rm->rm_row[0];
47481b22a98SMartin Matuska 
47581b22a98SMartin Matuska 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
47681b22a98SMartin Matuska 	ASSERT3U(rm->rm_nrows, ==, 1);
47781b22a98SMartin Matuska 
47881b22a98SMartin Matuska 	/*
47981b22a98SMartin Matuska 	 * Pad any parity columns with additional space to account for skip
48081b22a98SMartin Matuska 	 * sectors.
48181b22a98SMartin Matuska 	 */
48281b22a98SMartin Matuska 	if (rm->rm_skipstart < rr->rr_firstdatacol) {
48381b22a98SMartin Matuska 		ASSERT0(rm->rm_skipstart);
48481b22a98SMartin Matuska 		nwrapped = rm->rm_nskip;
48581b22a98SMartin Matuska 	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
48681b22a98SMartin Matuska 		nwrapped =
48781b22a98SMartin Matuska 		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
48881b22a98SMartin Matuska 	}
48981b22a98SMartin Matuska 
49081b22a98SMartin Matuska 	/*
49181b22a98SMartin Matuska 	 * Optional single skip sectors (rc_size == 0) will be handled in
49281b22a98SMartin Matuska 	 * vdev_raidz_io_start_write().
49381b22a98SMartin Matuska 	 */
49481b22a98SMartin Matuska 	int skipped = rr->rr_scols - rr->rr_cols;
49581b22a98SMartin Matuska 
49681b22a98SMartin Matuska 	/* Allocate buffers for the parity columns */
49781b22a98SMartin Matuska 	for (c = 0; c < rr->rr_firstdatacol; c++) {
49881b22a98SMartin Matuska 		raidz_col_t *rc = &rr->rr_col[c];
49981b22a98SMartin Matuska 
50081b22a98SMartin Matuska 		/*
50181b22a98SMartin Matuska 		 * Parity columns will pad out a linear ABD to account for
50281b22a98SMartin Matuska 		 * the skip sector. A linear ABD is used here because
50381b22a98SMartin Matuska 		 * parity calculations use the ABD buffer directly to calculate
50481b22a98SMartin Matuska 		 * parity. This avoids doing a memcpy back to the ABD after the
50581b22a98SMartin Matuska 		 * parity has been calculated. By issuing the parity column
50681b22a98SMartin Matuska 		 * with the skip sector we can reduce contention on the child
50781b22a98SMartin Matuska 		 * VDEV queue locks (vq_lock).
50881b22a98SMartin Matuska 		 */
50981b22a98SMartin Matuska 		if (c < nwrapped) {
51081b22a98SMartin Matuska 			rc->rc_abd = abd_alloc_linear(
51181b22a98SMartin Matuska 			    rc->rc_size + (1ULL << ashift), B_FALSE);
51281b22a98SMartin Matuska 			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
51381b22a98SMartin Matuska 			skipped++;
51481b22a98SMartin Matuska 		} else {
51581b22a98SMartin Matuska 			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
51681b22a98SMartin Matuska 		}
51781b22a98SMartin Matuska 	}
51881b22a98SMartin Matuska 
51981b22a98SMartin Matuska 	for (off = 0; c < rr->rr_cols; c++) {
52081b22a98SMartin Matuska 		raidz_col_t *rc = &rr->rr_col[c];
52181b22a98SMartin Matuska 		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
52281b22a98SMartin Matuska 		    zio->io_abd, off, rc->rc_size);
52381b22a98SMartin Matuska 
52481b22a98SMartin Matuska 		/*
52581b22a98SMartin Matuska 		 * Generate I/O for skip sectors to improve aggregation
52681b22a98SMartin Matuska 		 * continuity. We will use gang ABD's to reduce contention
52781b22a98SMartin Matuska 		 * on the child VDEV queue locks (vq_lock) by issuing
52881b22a98SMartin Matuska 		 * a single I/O that contains the data and skip sector.
52981b22a98SMartin Matuska 		 *
53081b22a98SMartin Matuska 		 * It is important to make sure that rc_size is not updated
53181b22a98SMartin Matuska 		 * even though we are adding a skip sector to the ABD. When
53281b22a98SMartin Matuska 		 * calculating the parity in vdev_raidz_generate_parity_row()
53381b22a98SMartin Matuska 		 * the rc_size is used to iterate through the ABD's. We can
53481b22a98SMartin Matuska 		 * not have zero'd out skip sectors used for calculating
53581b22a98SMartin Matuska 		 * parity for raidz, because those same sectors are not used
53681b22a98SMartin Matuska 		 * during reconstruction.
53781b22a98SMartin Matuska 		 */
53881b22a98SMartin Matuska 		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
53981b22a98SMartin Matuska 			rc->rc_abd = abd_alloc_gang();
54081b22a98SMartin Matuska 			abd_gang_add(rc->rc_abd, abd, B_TRUE);
54181b22a98SMartin Matuska 			abd_gang_add(rc->rc_abd,
54281b22a98SMartin Matuska 			    abd_get_zeros(1ULL << ashift), B_TRUE);
54381b22a98SMartin Matuska 			skipped++;
54481b22a98SMartin Matuska 		} else {
54581b22a98SMartin Matuska 			rc->rc_abd = abd;
54681b22a98SMartin Matuska 		}
54781b22a98SMartin Matuska 		off += rc->rc_size;
54881b22a98SMartin Matuska 	}
54981b22a98SMartin Matuska 
55081b22a98SMartin Matuska 	ASSERT3U(off, ==, zio->io_size);
55181b22a98SMartin Matuska 	ASSERT3S(skipped, ==, rm->rm_nskip);
55281b22a98SMartin Matuska }
55381b22a98SMartin Matuska 
55481b22a98SMartin Matuska static void
55581b22a98SMartin Matuska vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
55681b22a98SMartin Matuska {
55781b22a98SMartin Matuska 	int c;
55881b22a98SMartin Matuska 	raidz_row_t *rr = rm->rm_row[0];
55981b22a98SMartin Matuska 
56081b22a98SMartin Matuska 	ASSERT3U(rm->rm_nrows, ==, 1);
56181b22a98SMartin Matuska 
56281b22a98SMartin Matuska 	/* Allocate buffers for the parity columns */
56381b22a98SMartin Matuska 	for (c = 0; c < rr->rr_firstdatacol; c++)
56481b22a98SMartin Matuska 		rr->rr_col[c].rc_abd =
56581b22a98SMartin Matuska 		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
56681b22a98SMartin Matuska 
56781b22a98SMartin Matuska 	for (uint64_t off = 0; c < rr->rr_cols; c++) {
56881b22a98SMartin Matuska 		raidz_col_t *rc = &rr->rr_col[c];
56981b22a98SMartin Matuska 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
57081b22a98SMartin Matuska 		    zio->io_abd, off, rc->rc_size);
57181b22a98SMartin Matuska 		off += rc->rc_size;
57281b22a98SMartin Matuska 	}
57381b22a98SMartin Matuska }
57481b22a98SMartin Matuska 
575eda14cbcSMatt Macy /*
576eda14cbcSMatt Macy  * Divides the IO evenly across all child vdevs; usually, dcols is
577eda14cbcSMatt Macy  * the number of children in the target vdev.
578eda14cbcSMatt Macy  *
579eda14cbcSMatt Macy  * Avoid inlining the function to keep vdev_raidz_io_start(), which
580eda14cbcSMatt Macy  * is this functions only caller, as small as possible on the stack.
581eda14cbcSMatt Macy  */
582eda14cbcSMatt Macy noinline raidz_map_t *
583eda14cbcSMatt Macy vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
584eda14cbcSMatt Macy     uint64_t nparity)
585eda14cbcSMatt Macy {
5867877fdebSMatt Macy 	raidz_row_t *rr;
587eda14cbcSMatt Macy 	/* The starting RAIDZ (parent) vdev sector of the block. */
588eda14cbcSMatt Macy 	uint64_t b = zio->io_offset >> ashift;
589eda14cbcSMatt Macy 	/* The zio's size in units of the vdev's minimum sector size. */
590eda14cbcSMatt Macy 	uint64_t s = zio->io_size >> ashift;
591eda14cbcSMatt Macy 	/* The first column for this stripe. */
592eda14cbcSMatt Macy 	uint64_t f = b % dcols;
593eda14cbcSMatt Macy 	/* The starting byte offset on each child vdev. */
594eda14cbcSMatt Macy 	uint64_t o = (b / dcols) << ashift;
595e716630dSMartin Matuska 	uint64_t acols, scols;
596eda14cbcSMatt Macy 
5977877fdebSMatt Macy 	raidz_map_t *rm =
5987877fdebSMatt Macy 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
5997877fdebSMatt Macy 	rm->rm_nrows = 1;
6007877fdebSMatt Macy 
601eda14cbcSMatt Macy 	/*
602eda14cbcSMatt Macy 	 * "Quotient": The number of data sectors for this stripe on all but
603eda14cbcSMatt Macy 	 * the "big column" child vdevs that also contain "remainder" data.
604eda14cbcSMatt Macy 	 */
605e716630dSMartin Matuska 	uint64_t q = s / (dcols - nparity);
606eda14cbcSMatt Macy 
607eda14cbcSMatt Macy 	/*
608eda14cbcSMatt Macy 	 * "Remainder": The number of partial stripe data sectors in this I/O.
609eda14cbcSMatt Macy 	 * This will add a sector to some, but not all, child vdevs.
610eda14cbcSMatt Macy 	 */
611e716630dSMartin Matuska 	uint64_t r = s - q * (dcols - nparity);
612eda14cbcSMatt Macy 
613eda14cbcSMatt Macy 	/* The number of "big columns" - those which contain remainder data. */
614e716630dSMartin Matuska 	uint64_t bc = (r == 0 ? 0 : r + nparity);
615eda14cbcSMatt Macy 
616eda14cbcSMatt Macy 	/*
617eda14cbcSMatt Macy 	 * The total number of data and parity sectors associated with
618eda14cbcSMatt Macy 	 * this I/O.
619eda14cbcSMatt Macy 	 */
620e716630dSMartin Matuska 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
621eda14cbcSMatt Macy 
6227877fdebSMatt Macy 	/*
6237877fdebSMatt Macy 	 * acols: The columns that will be accessed.
6247877fdebSMatt Macy 	 * scols: The columns that will be accessed or skipped.
6257877fdebSMatt Macy 	 */
626eda14cbcSMatt Macy 	if (q == 0) {
627eda14cbcSMatt Macy 		/* Our I/O request doesn't span all child vdevs. */
628eda14cbcSMatt Macy 		acols = bc;
629eda14cbcSMatt Macy 		scols = MIN(dcols, roundup(bc, nparity + 1));
630eda14cbcSMatt Macy 	} else {
631eda14cbcSMatt Macy 		acols = dcols;
632eda14cbcSMatt Macy 		scols = dcols;
633eda14cbcSMatt Macy 	}
634eda14cbcSMatt Macy 
635eda14cbcSMatt Macy 	ASSERT3U(acols, <=, scols);
63687bf66d4SMartin Matuska 	rr = vdev_raidz_row_alloc(scols, zio);
6377877fdebSMatt Macy 	rm->rm_row[0] = rr;
6387877fdebSMatt Macy 	rr->rr_cols = acols;
6397877fdebSMatt Macy 	rr->rr_bigcols = bc;
6407877fdebSMatt Macy 	rr->rr_firstdatacol = nparity;
6417877fdebSMatt Macy #ifdef ZFS_DEBUG
6427877fdebSMatt Macy 	rr->rr_offset = zio->io_offset;
6437877fdebSMatt Macy 	rr->rr_size = zio->io_size;
6447877fdebSMatt Macy #endif
645eda14cbcSMatt Macy 
646e716630dSMartin Matuska 	uint64_t asize = 0;
647eda14cbcSMatt Macy 
648e716630dSMartin Matuska 	for (uint64_t c = 0; c < scols; c++) {
6497877fdebSMatt Macy 		raidz_col_t *rc = &rr->rr_col[c];
650e716630dSMartin Matuska 		uint64_t col = f + c;
651e716630dSMartin Matuska 		uint64_t coff = o;
652eda14cbcSMatt Macy 		if (col >= dcols) {
653eda14cbcSMatt Macy 			col -= dcols;
654eda14cbcSMatt Macy 			coff += 1ULL << ashift;
655eda14cbcSMatt Macy 		}
6567877fdebSMatt Macy 		rc->rc_devidx = col;
6577877fdebSMatt Macy 		rc->rc_offset = coff;
658eda14cbcSMatt Macy 
659eda14cbcSMatt Macy 		if (c >= acols)
6607877fdebSMatt Macy 			rc->rc_size = 0;
661eda14cbcSMatt Macy 		else if (c < bc)
6627877fdebSMatt Macy 			rc->rc_size = (q + 1) << ashift;
663eda14cbcSMatt Macy 		else
6647877fdebSMatt Macy 			rc->rc_size = q << ashift;
665eda14cbcSMatt Macy 
6667877fdebSMatt Macy 		asize += rc->rc_size;
667eda14cbcSMatt Macy 	}
668eda14cbcSMatt Macy 
669eda14cbcSMatt Macy 	ASSERT3U(asize, ==, tot << ashift);
670eda14cbcSMatt Macy 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
6717877fdebSMatt Macy 	rm->rm_skipstart = bc;
672eda14cbcSMatt Macy 
673eda14cbcSMatt Macy 	/*
674eda14cbcSMatt Macy 	 * If all data stored spans all columns, there's a danger that parity
675eda14cbcSMatt Macy 	 * will always be on the same device and, since parity isn't read
676eda14cbcSMatt Macy 	 * during normal operation, that device's I/O bandwidth won't be
677eda14cbcSMatt Macy 	 * used effectively. We therefore switch the parity every 1MB.
678eda14cbcSMatt Macy 	 *
679eda14cbcSMatt Macy 	 * ... at least that was, ostensibly, the theory. As a practical
680eda14cbcSMatt Macy 	 * matter unless we juggle the parity between all devices evenly, we
681eda14cbcSMatt Macy 	 * won't see any benefit. Further, occasional writes that aren't a
682eda14cbcSMatt Macy 	 * multiple of the LCM of the number of children and the minimum
683eda14cbcSMatt Macy 	 * stripe width are sufficient to avoid pessimal behavior.
684eda14cbcSMatt Macy 	 * Unfortunately, this decision created an implicit on-disk format
685eda14cbcSMatt Macy 	 * requirement that we need to support for all eternity, but only
686eda14cbcSMatt Macy 	 * for single-parity RAID-Z.
687eda14cbcSMatt Macy 	 *
688eda14cbcSMatt Macy 	 * If we intend to skip a sector in the zeroth column for padding
689eda14cbcSMatt Macy 	 * we must make sure to note this swap. We will never intend to
690eda14cbcSMatt Macy 	 * skip the first column since at least one data and one parity
691eda14cbcSMatt Macy 	 * column must appear in each row.
692eda14cbcSMatt Macy 	 */
6937877fdebSMatt Macy 	ASSERT(rr->rr_cols >= 2);
6947877fdebSMatt Macy 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
695eda14cbcSMatt Macy 
6967877fdebSMatt Macy 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
697e716630dSMartin Matuska 		uint64_t devidx = rr->rr_col[0].rc_devidx;
6987877fdebSMatt Macy 		o = rr->rr_col[0].rc_offset;
6997877fdebSMatt Macy 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
7007877fdebSMatt Macy 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
7017877fdebSMatt Macy 		rr->rr_col[1].rc_devidx = devidx;
7027877fdebSMatt Macy 		rr->rr_col[1].rc_offset = o;
703eda14cbcSMatt Macy 		if (rm->rm_skipstart == 0)
704eda14cbcSMatt Macy 			rm->rm_skipstart = 1;
705eda14cbcSMatt Macy 	}
706eda14cbcSMatt Macy 
70781b22a98SMartin Matuska 	if (zio->io_type == ZIO_TYPE_WRITE) {
70881b22a98SMartin Matuska 		vdev_raidz_map_alloc_write(zio, rm, ashift);
70981b22a98SMartin Matuska 	} else {
71081b22a98SMartin Matuska 		vdev_raidz_map_alloc_read(zio, rm);
71181b22a98SMartin Matuska 	}
712e716630dSMartin Matuska 	/* init RAIDZ parity ops */
713e716630dSMartin Matuska 	rm->rm_ops = vdev_raidz_math_get_ops();
71481b22a98SMartin Matuska 
715e716630dSMartin Matuska 	return (rm);
716e716630dSMartin Matuska }
717e716630dSMartin Matuska 
718e716630dSMartin Matuska /*
719e716630dSMartin Matuska  * Everything before reflow_offset_synced should have been moved to the new
720e716630dSMartin Matuska  * location (read and write completed).  However, this may not yet be reflected
721e716630dSMartin Matuska  * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
722e716630dSMartin Matuska  * uberblock has not yet been written). If reflow is not in progress,
723e716630dSMartin Matuska  * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
724e716630dSMartin Matuska  * entirely before reflow_offset_synced, it will come from the new location.
725e716630dSMartin Matuska  * Otherwise this row will come from the old location.  Therefore, rows that
726e716630dSMartin Matuska  * straddle the reflow_offset_synced will come from the old location.
727e716630dSMartin Matuska  *
728e716630dSMartin Matuska  * For writes, reflow_offset_next is the next offset to copy.  If a sector has
729e716630dSMartin Matuska  * been copied, but not yet reflected in the on-disk progress
730e716630dSMartin Matuska  * (reflow_offset_synced), it will also be written to the new (already copied)
731e716630dSMartin Matuska  * offset.
732e716630dSMartin Matuska  */
733e716630dSMartin Matuska noinline raidz_map_t *
734e716630dSMartin Matuska vdev_raidz_map_alloc_expanded(zio_t *zio,
735e716630dSMartin Matuska     uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
736e716630dSMartin Matuska     uint64_t nparity, uint64_t reflow_offset_synced,
737e716630dSMartin Matuska     uint64_t reflow_offset_next, boolean_t use_scratch)
738e716630dSMartin Matuska {
739e716630dSMartin Matuska 	abd_t *abd = zio->io_abd;
740e716630dSMartin Matuska 	uint64_t offset = zio->io_offset;
741e716630dSMartin Matuska 	uint64_t size = zio->io_size;
742e716630dSMartin Matuska 
743e716630dSMartin Matuska 	/* The zio's size in units of the vdev's minimum sector size. */
744e716630dSMartin Matuska 	uint64_t s = size >> ashift;
745e716630dSMartin Matuska 
746e716630dSMartin Matuska 	/*
747e716630dSMartin Matuska 	 * "Quotient": The number of data sectors for this stripe on all but
748e716630dSMartin Matuska 	 * the "big column" child vdevs that also contain "remainder" data.
749e716630dSMartin Matuska 	 * AKA "full rows"
750e716630dSMartin Matuska 	 */
751e716630dSMartin Matuska 	uint64_t q = s / (logical_cols - nparity);
752e716630dSMartin Matuska 
753e716630dSMartin Matuska 	/*
754e716630dSMartin Matuska 	 * "Remainder": The number of partial stripe data sectors in this I/O.
755e716630dSMartin Matuska 	 * This will add a sector to some, but not all, child vdevs.
756e716630dSMartin Matuska 	 */
757e716630dSMartin Matuska 	uint64_t r = s - q * (logical_cols - nparity);
758e716630dSMartin Matuska 
759e716630dSMartin Matuska 	/* The number of "big columns" - those which contain remainder data. */
760e716630dSMartin Matuska 	uint64_t bc = (r == 0 ? 0 : r + nparity);
761e716630dSMartin Matuska 
762e716630dSMartin Matuska 	/*
763e716630dSMartin Matuska 	 * The total number of data and parity sectors associated with
764e716630dSMartin Matuska 	 * this I/O.
765e716630dSMartin Matuska 	 */
766e716630dSMartin Matuska 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
767e716630dSMartin Matuska 
768e716630dSMartin Matuska 	/* How many rows contain data (not skip) */
769e716630dSMartin Matuska 	uint64_t rows = howmany(tot, logical_cols);
770e716630dSMartin Matuska 	int cols = MIN(tot, logical_cols);
771e716630dSMartin Matuska 
772e716630dSMartin Matuska 	raidz_map_t *rm =
773e716630dSMartin Matuska 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
774e716630dSMartin Matuska 	    KM_SLEEP);
775e716630dSMartin Matuska 	rm->rm_nrows = rows;
776e716630dSMartin Matuska 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
777e716630dSMartin Matuska 	rm->rm_skipstart = bc;
778e716630dSMartin Matuska 	uint64_t asize = 0;
779e716630dSMartin Matuska 
780e716630dSMartin Matuska 	for (uint64_t row = 0; row < rows; row++) {
781e716630dSMartin Matuska 		boolean_t row_use_scratch = B_FALSE;
78287bf66d4SMartin Matuska 		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
783e716630dSMartin Matuska 		rm->rm_row[row] = rr;
784e716630dSMartin Matuska 
785e716630dSMartin Matuska 		/* The starting RAIDZ (parent) vdev sector of the row. */
786e716630dSMartin Matuska 		uint64_t b = (offset >> ashift) + row * logical_cols;
787e716630dSMartin Matuska 
788e716630dSMartin Matuska 		/*
789e716630dSMartin Matuska 		 * If we are in the middle of a reflow, and the copying has
790e716630dSMartin Matuska 		 * not yet completed for any part of this row, then use the
791e716630dSMartin Matuska 		 * old location of this row.  Note that reflow_offset_synced
792e716630dSMartin Matuska 		 * reflects the i/o that's been completed, because it's
793e716630dSMartin Matuska 		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
794e716630dSMartin Matuska 		 * This is sufficient for our check, even if that progress
795e716630dSMartin Matuska 		 * has not yet been recorded to disk (reflected in
796e716630dSMartin Matuska 		 * spa_ubsync).  Also note that we consider the last row to
797e716630dSMartin Matuska 		 * be "full width" (`cols`-wide rather than `bc`-wide) for
798e716630dSMartin Matuska 		 * this calculation. This causes a tiny bit of unnecessary
799e716630dSMartin Matuska 		 * double-writes but is safe and simpler to calculate.
800e716630dSMartin Matuska 		 */
801e716630dSMartin Matuska 		int row_phys_cols = physical_cols;
802e716630dSMartin Matuska 		if (b + cols > reflow_offset_synced >> ashift)
803e716630dSMartin Matuska 			row_phys_cols--;
804e716630dSMartin Matuska 		else if (use_scratch)
805e716630dSMartin Matuska 			row_use_scratch = B_TRUE;
806e716630dSMartin Matuska 
807e716630dSMartin Matuska 		/* starting child of this row */
808e716630dSMartin Matuska 		uint64_t child_id = b % row_phys_cols;
809e716630dSMartin Matuska 		/* The starting byte offset on each child vdev. */
810e716630dSMartin Matuska 		uint64_t child_offset = (b / row_phys_cols) << ashift;
811e716630dSMartin Matuska 
812e716630dSMartin Matuska 		/*
813e716630dSMartin Matuska 		 * Note, rr_cols is the entire width of the block, even
814e716630dSMartin Matuska 		 * if this row is shorter.  This is needed because parity
815e716630dSMartin Matuska 		 * generation (for Q and R) needs to know the entire width,
816e716630dSMartin Matuska 		 * because it treats the short row as though it was
817e716630dSMartin Matuska 		 * full-width (and the "phantom" sectors were zero-filled).
818e716630dSMartin Matuska 		 *
819e716630dSMartin Matuska 		 * Another approach to this would be to set cols shorter
820e716630dSMartin Matuska 		 * (to just the number of columns that we might do i/o to)
821e716630dSMartin Matuska 		 * and have another mechanism to tell the parity generation
822e716630dSMartin Matuska 		 * about the "entire width".  Reconstruction (at least
823e716630dSMartin Matuska 		 * vdev_raidz_reconstruct_general()) would also need to
824e716630dSMartin Matuska 		 * know about the "entire width".
825e716630dSMartin Matuska 		 */
826e716630dSMartin Matuska 		rr->rr_firstdatacol = nparity;
827e716630dSMartin Matuska #ifdef ZFS_DEBUG
828e716630dSMartin Matuska 		/*
829e716630dSMartin Matuska 		 * note: rr_size is PSIZE, not ASIZE
830e716630dSMartin Matuska 		 */
831e716630dSMartin Matuska 		rr->rr_offset = b << ashift;
832e716630dSMartin Matuska 		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
833e716630dSMartin Matuska #endif
834e716630dSMartin Matuska 
835e716630dSMartin Matuska 		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
836e716630dSMartin Matuska 			if (child_id >= row_phys_cols) {
837e716630dSMartin Matuska 				child_id -= row_phys_cols;
838e716630dSMartin Matuska 				child_offset += 1ULL << ashift;
839e716630dSMartin Matuska 			}
840e716630dSMartin Matuska 			raidz_col_t *rc = &rr->rr_col[c];
841e716630dSMartin Matuska 			rc->rc_devidx = child_id;
842e716630dSMartin Matuska 			rc->rc_offset = child_offset;
843e716630dSMartin Matuska 
844e716630dSMartin Matuska 			/*
845e716630dSMartin Matuska 			 * Get this from the scratch space if appropriate.
846e716630dSMartin Matuska 			 * This only happens if we crashed in the middle of
847e716630dSMartin Matuska 			 * raidz_reflow_scratch_sync() (while it's running,
848e716630dSMartin Matuska 			 * the rangelock prevents us from doing concurrent
849e716630dSMartin Matuska 			 * io), and even then only during zpool import or
850e716630dSMartin Matuska 			 * when the pool is imported readonly.
851e716630dSMartin Matuska 			 */
852e716630dSMartin Matuska 			if (row_use_scratch)
853e716630dSMartin Matuska 				rc->rc_offset -= VDEV_BOOT_SIZE;
854e716630dSMartin Matuska 
855e716630dSMartin Matuska 			uint64_t dc = c - rr->rr_firstdatacol;
856e716630dSMartin Matuska 			if (c < rr->rr_firstdatacol) {
857e716630dSMartin Matuska 				rc->rc_size = 1ULL << ashift;
858e716630dSMartin Matuska 
859e716630dSMartin Matuska 				/*
860e716630dSMartin Matuska 				 * Parity sectors' rc_abd's are set below
861e716630dSMartin Matuska 				 * after determining if this is an aggregation.
862e716630dSMartin Matuska 				 */
863e716630dSMartin Matuska 			} else if (row == rows - 1 && bc != 0 && c >= bc) {
864e716630dSMartin Matuska 				/*
865e716630dSMartin Matuska 				 * Past the end of the block (even including
866e716630dSMartin Matuska 				 * skip sectors).  This sector is part of the
867e716630dSMartin Matuska 				 * map so that we have full rows for p/q parity
868e716630dSMartin Matuska 				 * generation.
869e716630dSMartin Matuska 				 */
870e716630dSMartin Matuska 				rc->rc_size = 0;
871e716630dSMartin Matuska 				rc->rc_abd = NULL;
872e716630dSMartin Matuska 			} else {
873e716630dSMartin Matuska 				/* "data column" (col excluding parity) */
874e716630dSMartin Matuska 				uint64_t off;
875e716630dSMartin Matuska 
876e716630dSMartin Matuska 				if (c < bc || r == 0) {
877e716630dSMartin Matuska 					off = dc * rows + row;
878e716630dSMartin Matuska 				} else {
879e716630dSMartin Matuska 					off = r * rows +
880e716630dSMartin Matuska 					    (dc - r) * (rows - 1) + row;
881e716630dSMartin Matuska 				}
882e716630dSMartin Matuska 				rc->rc_size = 1ULL << ashift;
883e716630dSMartin Matuska 				rc->rc_abd = abd_get_offset_struct(
884e716630dSMartin Matuska 				    &rc->rc_abdstruct, abd, off << ashift,
885e716630dSMartin Matuska 				    rc->rc_size);
886e716630dSMartin Matuska 			}
887e716630dSMartin Matuska 
888e716630dSMartin Matuska 			if (rc->rc_size == 0)
889e716630dSMartin Matuska 				continue;
890e716630dSMartin Matuska 
891e716630dSMartin Matuska 			/*
892e716630dSMartin Matuska 			 * If any part of this row is in both old and new
893e716630dSMartin Matuska 			 * locations, the primary location is the old
894e716630dSMartin Matuska 			 * location. If this sector was already copied to the
895e716630dSMartin Matuska 			 * new location, we need to also write to the new,
896e716630dSMartin Matuska 			 * "shadow" location.
897e716630dSMartin Matuska 			 *
898e716630dSMartin Matuska 			 * Note, `row_phys_cols != physical_cols` indicates
899e716630dSMartin Matuska 			 * that the primary location is the old location.
900e716630dSMartin Matuska 			 * `b+c < reflow_offset_next` indicates that the copy
901e716630dSMartin Matuska 			 * to the new location has been initiated. We know
902e716630dSMartin Matuska 			 * that the copy has completed because we have the
903e716630dSMartin Matuska 			 * rangelock, which is held exclusively while the
904e716630dSMartin Matuska 			 * copy is in progress.
905e716630dSMartin Matuska 			 */
906e716630dSMartin Matuska 			if (row_use_scratch ||
907e716630dSMartin Matuska 			    (row_phys_cols != physical_cols &&
908e716630dSMartin Matuska 			    b + c < reflow_offset_next >> ashift)) {
909e716630dSMartin Matuska 				rc->rc_shadow_devidx = (b + c) % physical_cols;
910e716630dSMartin Matuska 				rc->rc_shadow_offset =
911e716630dSMartin Matuska 				    ((b + c) / physical_cols) << ashift;
912e716630dSMartin Matuska 				if (row_use_scratch)
913e716630dSMartin Matuska 					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
914e716630dSMartin Matuska 			}
915e716630dSMartin Matuska 
916e716630dSMartin Matuska 			asize += rc->rc_size;
917e716630dSMartin Matuska 		}
918e716630dSMartin Matuska 
919e716630dSMartin Matuska 		/*
920e716630dSMartin Matuska 		 * See comment in vdev_raidz_map_alloc()
921e716630dSMartin Matuska 		 */
922e716630dSMartin Matuska 		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
923e716630dSMartin Matuska 		    (offset & (1ULL << 20))) {
924e716630dSMartin Matuska 			ASSERT(rr->rr_cols >= 2);
925e716630dSMartin Matuska 			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
926e716630dSMartin Matuska 
927e716630dSMartin Matuska 			int devidx0 = rr->rr_col[0].rc_devidx;
928e716630dSMartin Matuska 			uint64_t offset0 = rr->rr_col[0].rc_offset;
929e716630dSMartin Matuska 			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
930e716630dSMartin Matuska 			uint64_t shadow_offset0 =
931e716630dSMartin Matuska 			    rr->rr_col[0].rc_shadow_offset;
932e716630dSMartin Matuska 
933e716630dSMartin Matuska 			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
934e716630dSMartin Matuska 			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
935e716630dSMartin Matuska 			rr->rr_col[0].rc_shadow_devidx =
936e716630dSMartin Matuska 			    rr->rr_col[1].rc_shadow_devidx;
937e716630dSMartin Matuska 			rr->rr_col[0].rc_shadow_offset =
938e716630dSMartin Matuska 			    rr->rr_col[1].rc_shadow_offset;
939e716630dSMartin Matuska 
940e716630dSMartin Matuska 			rr->rr_col[1].rc_devidx = devidx0;
941e716630dSMartin Matuska 			rr->rr_col[1].rc_offset = offset0;
942e716630dSMartin Matuska 			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
943e716630dSMartin Matuska 			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
944e716630dSMartin Matuska 		}
945e716630dSMartin Matuska 	}
946e716630dSMartin Matuska 	ASSERT3U(asize, ==, tot << ashift);
947e716630dSMartin Matuska 
948e716630dSMartin Matuska 	/*
949e716630dSMartin Matuska 	 * Determine if the block is contiguous, in which case we can use
950e716630dSMartin Matuska 	 * an aggregation.
951e716630dSMartin Matuska 	 */
952e716630dSMartin Matuska 	if (rows >= raidz_io_aggregate_rows) {
953e716630dSMartin Matuska 		rm->rm_nphys_cols = physical_cols;
954e716630dSMartin Matuska 		rm->rm_phys_col =
955e716630dSMartin Matuska 		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
956e716630dSMartin Matuska 		    KM_SLEEP);
957e716630dSMartin Matuska 
958e716630dSMartin Matuska 		/*
959e716630dSMartin Matuska 		 * Determine the aggregate io's offset and size, and check
960e716630dSMartin Matuska 		 * that the io is contiguous.
961e716630dSMartin Matuska 		 */
962e716630dSMartin Matuska 		for (int i = 0;
963e716630dSMartin Matuska 		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
964e716630dSMartin Matuska 			raidz_row_t *rr = rm->rm_row[i];
965e716630dSMartin Matuska 			for (int c = 0; c < rr->rr_cols; c++) {
966e716630dSMartin Matuska 				raidz_col_t *rc = &rr->rr_col[c];
967e716630dSMartin Matuska 				raidz_col_t *prc =
968e716630dSMartin Matuska 				    &rm->rm_phys_col[rc->rc_devidx];
969e716630dSMartin Matuska 
970e716630dSMartin Matuska 				if (rc->rc_size == 0)
971e716630dSMartin Matuska 					continue;
972e716630dSMartin Matuska 
973e716630dSMartin Matuska 				if (prc->rc_size == 0) {
974e716630dSMartin Matuska 					ASSERT0(prc->rc_offset);
975e716630dSMartin Matuska 					prc->rc_offset = rc->rc_offset;
976e716630dSMartin Matuska 				} else if (prc->rc_offset + prc->rc_size !=
977e716630dSMartin Matuska 				    rc->rc_offset) {
978e716630dSMartin Matuska 					/*
979e716630dSMartin Matuska 					 * This block is not contiguous and
980e716630dSMartin Matuska 					 * therefore can't be aggregated.
981e716630dSMartin Matuska 					 * This is expected to be rare, so
982e716630dSMartin Matuska 					 * the cost of allocating and then
983e716630dSMartin Matuska 					 * freeing rm_phys_col is not
984e716630dSMartin Matuska 					 * significant.
985e716630dSMartin Matuska 					 */
986e716630dSMartin Matuska 					kmem_free(rm->rm_phys_col,
987e716630dSMartin Matuska 					    sizeof (raidz_col_t) *
988e716630dSMartin Matuska 					    rm->rm_nphys_cols);
989e716630dSMartin Matuska 					rm->rm_phys_col = NULL;
990e716630dSMartin Matuska 					rm->rm_nphys_cols = 0;
991e716630dSMartin Matuska 					break;
992e716630dSMartin Matuska 				}
993e716630dSMartin Matuska 				prc->rc_size += rc->rc_size;
994e716630dSMartin Matuska 			}
995e716630dSMartin Matuska 		}
996e716630dSMartin Matuska 	}
997e716630dSMartin Matuska 	if (rm->rm_phys_col != NULL) {
998e716630dSMartin Matuska 		/*
999e716630dSMartin Matuska 		 * Allocate aggregate ABD's.
1000e716630dSMartin Matuska 		 */
1001e716630dSMartin Matuska 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
1002e716630dSMartin Matuska 			raidz_col_t *prc = &rm->rm_phys_col[i];
1003e716630dSMartin Matuska 
1004e716630dSMartin Matuska 			prc->rc_devidx = i;
1005e716630dSMartin Matuska 
1006e716630dSMartin Matuska 			if (prc->rc_size == 0)
1007e716630dSMartin Matuska 				continue;
1008e716630dSMartin Matuska 
1009e716630dSMartin Matuska 			prc->rc_abd =
1010e716630dSMartin Matuska 			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
1011e716630dSMartin Matuska 			    B_FALSE);
1012e716630dSMartin Matuska 		}
1013e716630dSMartin Matuska 
1014e716630dSMartin Matuska 		/*
1015e716630dSMartin Matuska 		 * Point the parity abd's into the aggregate abd's.
1016e716630dSMartin Matuska 		 */
1017e716630dSMartin Matuska 		for (int i = 0; i < rm->rm_nrows; i++) {
1018e716630dSMartin Matuska 			raidz_row_t *rr = rm->rm_row[i];
1019e716630dSMartin Matuska 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
1020e716630dSMartin Matuska 				raidz_col_t *rc = &rr->rr_col[c];
1021e716630dSMartin Matuska 				raidz_col_t *prc =
1022e716630dSMartin Matuska 				    &rm->rm_phys_col[rc->rc_devidx];
1023e716630dSMartin Matuska 				rc->rc_abd =
1024e716630dSMartin Matuska 				    abd_get_offset_struct(&rc->rc_abdstruct,
1025e716630dSMartin Matuska 				    prc->rc_abd,
1026e716630dSMartin Matuska 				    rc->rc_offset - prc->rc_offset,
1027e716630dSMartin Matuska 				    rc->rc_size);
1028e716630dSMartin Matuska 			}
1029e716630dSMartin Matuska 		}
1030e716630dSMartin Matuska 	} else {
1031e716630dSMartin Matuska 		/*
1032e716630dSMartin Matuska 		 * Allocate new abd's for the parity sectors.
1033e716630dSMartin Matuska 		 */
1034e716630dSMartin Matuska 		for (int i = 0; i < rm->rm_nrows; i++) {
1035e716630dSMartin Matuska 			raidz_row_t *rr = rm->rm_row[i];
1036e716630dSMartin Matuska 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
1037e716630dSMartin Matuska 				raidz_col_t *rc = &rr->rr_col[c];
1038e716630dSMartin Matuska 				rc->rc_abd =
1039e716630dSMartin Matuska 				    abd_alloc_linear(rc->rc_size,
1040e716630dSMartin Matuska 				    B_TRUE);
1041e716630dSMartin Matuska 			}
1042e716630dSMartin Matuska 		}
1043e716630dSMartin Matuska 	}
1044eda14cbcSMatt Macy 	/* init RAIDZ parity ops */
1045eda14cbcSMatt Macy 	rm->rm_ops = vdev_raidz_math_get_ops();
1046eda14cbcSMatt Macy 
1047eda14cbcSMatt Macy 	return (rm);
1048eda14cbcSMatt Macy }
1049eda14cbcSMatt Macy 
/*
 * Cursor state shared with the parity-accumulation callbacks
 * (vdev_raidz_p_func, vdev_raidz_pq_func, vdev_raidz_pqr_func).
 *
 * Each member that is in use points at the next 64-bit word of a parity
 * buffer; the callbacks advance the members as they consume source words.
 * Members for parity levels that are not being generated are NULL, and the
 * callbacks assert on exactly that combination.
 */
struct pqr_struct {
	uint64_t *p;	/* next word of the P parity buffer */
	uint64_t *q;	/* next word of the Q parity buffer, or NULL */
	uint64_t *r;	/* next word of the R parity buffer, or NULL */
};
1055eda14cbcSMatt Macy 
1056eda14cbcSMatt Macy static int
1057eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private)
1058eda14cbcSMatt Macy {
1059eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
1060eda14cbcSMatt Macy 	const uint64_t *src = buf;
1061e716630dSMartin Matuska 	int cnt = size / sizeof (src[0]);
1062eda14cbcSMatt Macy 
1063eda14cbcSMatt Macy 	ASSERT(pqr->p && !pqr->q && !pqr->r);
1064eda14cbcSMatt Macy 
1065e716630dSMartin Matuska 	for (int i = 0; i < cnt; i++, src++, pqr->p++)
1066eda14cbcSMatt Macy 		*pqr->p ^= *src;
1067eda14cbcSMatt Macy 
1068eda14cbcSMatt Macy 	return (0);
1069eda14cbcSMatt Macy }
1070eda14cbcSMatt Macy 
1071eda14cbcSMatt Macy static int
1072eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private)
1073eda14cbcSMatt Macy {
1074eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
1075eda14cbcSMatt Macy 	const uint64_t *src = buf;
1076eda14cbcSMatt Macy 	uint64_t mask;
1077e716630dSMartin Matuska 	int cnt = size / sizeof (src[0]);
1078eda14cbcSMatt Macy 
1079eda14cbcSMatt Macy 	ASSERT(pqr->p && pqr->q && !pqr->r);
1080eda14cbcSMatt Macy 
1081e716630dSMartin Matuska 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1082eda14cbcSMatt Macy 		*pqr->p ^= *src;
1083eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1084eda14cbcSMatt Macy 		*pqr->q ^= *src;
1085eda14cbcSMatt Macy 	}
1086eda14cbcSMatt Macy 
1087eda14cbcSMatt Macy 	return (0);
1088eda14cbcSMatt Macy }
1089eda14cbcSMatt Macy 
1090eda14cbcSMatt Macy static int
1091eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1092eda14cbcSMatt Macy {
1093eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
1094eda14cbcSMatt Macy 	const uint64_t *src = buf;
1095eda14cbcSMatt Macy 	uint64_t mask;
1096e716630dSMartin Matuska 	int cnt = size / sizeof (src[0]);
1097eda14cbcSMatt Macy 
1098eda14cbcSMatt Macy 	ASSERT(pqr->p && pqr->q && pqr->r);
1099eda14cbcSMatt Macy 
1100e716630dSMartin Matuska 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1101eda14cbcSMatt Macy 		*pqr->p ^= *src;
1102eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1103eda14cbcSMatt Macy 		*pqr->q ^= *src;
1104eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1105eda14cbcSMatt Macy 		*pqr->r ^= *src;
1106eda14cbcSMatt Macy 	}
1107eda14cbcSMatt Macy 
1108eda14cbcSMatt Macy 	return (0);
1109eda14cbcSMatt Macy }
1110eda14cbcSMatt Macy 
1111eda14cbcSMatt Macy static void
11127877fdebSMatt Macy vdev_raidz_generate_parity_p(raidz_row_t *rr)
1113eda14cbcSMatt Macy {
11147877fdebSMatt Macy 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1115eda14cbcSMatt Macy 
11167877fdebSMatt Macy 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
11177877fdebSMatt Macy 		abd_t *src = rr->rr_col[c].rc_abd;
1118eda14cbcSMatt Macy 
11197877fdebSMatt Macy 		if (c == rr->rr_firstdatacol) {
11207877fdebSMatt Macy 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1121eda14cbcSMatt Macy 		} else {
1122eda14cbcSMatt Macy 			struct pqr_struct pqr = { p, NULL, NULL };
11237877fdebSMatt Macy 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1124eda14cbcSMatt Macy 			    vdev_raidz_p_func, &pqr);
1125eda14cbcSMatt Macy 		}
1126eda14cbcSMatt Macy 	}
1127eda14cbcSMatt Macy }
1128eda14cbcSMatt Macy 
1129eda14cbcSMatt Macy static void
11307877fdebSMatt Macy vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1131eda14cbcSMatt Macy {
11327877fdebSMatt Macy 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
11337877fdebSMatt Macy 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
11347877fdebSMatt Macy 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
11357877fdebSMatt Macy 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
11367877fdebSMatt Macy 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1137eda14cbcSMatt Macy 
11387877fdebSMatt Macy 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
11397877fdebSMatt Macy 		abd_t *src = rr->rr_col[c].rc_abd;
1140eda14cbcSMatt Macy 
11417877fdebSMatt Macy 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1142eda14cbcSMatt Macy 
11437877fdebSMatt Macy 		if (c == rr->rr_firstdatacol) {
1144eda14cbcSMatt Macy 			ASSERT(ccnt == pcnt || ccnt == 0);
11457877fdebSMatt Macy 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
11467877fdebSMatt Macy 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1147eda14cbcSMatt Macy 
11487877fdebSMatt Macy 			for (uint64_t i = ccnt; i < pcnt; i++) {
1149eda14cbcSMatt Macy 				p[i] = 0;
1150eda14cbcSMatt Macy 				q[i] = 0;
1151eda14cbcSMatt Macy 			}
1152eda14cbcSMatt Macy 		} else {
1153eda14cbcSMatt Macy 			struct pqr_struct pqr = { p, q, NULL };
1154eda14cbcSMatt Macy 
1155eda14cbcSMatt Macy 			ASSERT(ccnt <= pcnt);
11567877fdebSMatt Macy 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1157eda14cbcSMatt Macy 			    vdev_raidz_pq_func, &pqr);
1158eda14cbcSMatt Macy 
1159eda14cbcSMatt Macy 			/*
1160eda14cbcSMatt Macy 			 * Treat short columns as though they are full of 0s.
1161eda14cbcSMatt Macy 			 * Note that there's therefore nothing needed for P.
1162eda14cbcSMatt Macy 			 */
11637877fdebSMatt Macy 			uint64_t mask;
11647877fdebSMatt Macy 			for (uint64_t i = ccnt; i < pcnt; i++) {
1165eda14cbcSMatt Macy 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1166eda14cbcSMatt Macy 			}
1167eda14cbcSMatt Macy 		}
1168eda14cbcSMatt Macy 	}
1169eda14cbcSMatt Macy }
1170eda14cbcSMatt Macy 
1171eda14cbcSMatt Macy static void
11727877fdebSMatt Macy vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1173eda14cbcSMatt Macy {
11747877fdebSMatt Macy 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
11757877fdebSMatt Macy 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
11767877fdebSMatt Macy 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
11777877fdebSMatt Macy 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
11787877fdebSMatt Macy 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
11797877fdebSMatt Macy 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
11807877fdebSMatt Macy 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
11817877fdebSMatt Macy 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
1182eda14cbcSMatt Macy 
11837877fdebSMatt Macy 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
11847877fdebSMatt Macy 		abd_t *src = rr->rr_col[c].rc_abd;
1185eda14cbcSMatt Macy 
11867877fdebSMatt Macy 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1187eda14cbcSMatt Macy 
11887877fdebSMatt Macy 		if (c == rr->rr_firstdatacol) {
1189eda14cbcSMatt Macy 			ASSERT(ccnt == pcnt || ccnt == 0);
11907877fdebSMatt Macy 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
11917877fdebSMatt Macy 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
11927877fdebSMatt Macy 			(void) memcpy(r, p, rr->rr_col[c].rc_size);
1193eda14cbcSMatt Macy 
11947877fdebSMatt Macy 			for (uint64_t i = ccnt; i < pcnt; i++) {
1195eda14cbcSMatt Macy 				p[i] = 0;
1196eda14cbcSMatt Macy 				q[i] = 0;
1197eda14cbcSMatt Macy 				r[i] = 0;
1198eda14cbcSMatt Macy 			}
1199eda14cbcSMatt Macy 		} else {
1200eda14cbcSMatt Macy 			struct pqr_struct pqr = { p, q, r };
1201eda14cbcSMatt Macy 
1202eda14cbcSMatt Macy 			ASSERT(ccnt <= pcnt);
12037877fdebSMatt Macy 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1204eda14cbcSMatt Macy 			    vdev_raidz_pqr_func, &pqr);
1205eda14cbcSMatt Macy 
1206eda14cbcSMatt Macy 			/*
1207eda14cbcSMatt Macy 			 * Treat short columns as though they are full of 0s.
1208eda14cbcSMatt Macy 			 * Note that there's therefore nothing needed for P.
1209eda14cbcSMatt Macy 			 */
12107877fdebSMatt Macy 			uint64_t mask;
12117877fdebSMatt Macy 			for (uint64_t i = ccnt; i < pcnt; i++) {
1212eda14cbcSMatt Macy 				VDEV_RAIDZ_64MUL_2(q[i], mask);
1213eda14cbcSMatt Macy 				VDEV_RAIDZ_64MUL_4(r[i], mask);
1214eda14cbcSMatt Macy 			}
1215eda14cbcSMatt Macy 		}
1216eda14cbcSMatt Macy 	}
1217eda14cbcSMatt Macy }
1218eda14cbcSMatt Macy 
1219eda14cbcSMatt Macy /*
1220eda14cbcSMatt Macy  * Generate RAID parity in the first virtual columns according to the number of
1221eda14cbcSMatt Macy  * parity columns available.
1222eda14cbcSMatt Macy  */
1223eda14cbcSMatt Macy void
12247877fdebSMatt Macy vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1225eda14cbcSMatt Macy {
1226e716630dSMartin Matuska 	if (rr->rr_cols == 0) {
1227e716630dSMartin Matuska 		/*
1228e716630dSMartin Matuska 		 * We are handling this block one row at a time (because
1229e716630dSMartin Matuska 		 * this block has a different logical vs physical width,
1230e716630dSMartin Matuska 		 * due to RAIDZ expansion), and this is a pad-only row,
1231e716630dSMartin Matuska 		 * which has no parity.
1232e716630dSMartin Matuska 		 */
1233e716630dSMartin Matuska 		return;
1234e716630dSMartin Matuska 	}
12357877fdebSMatt Macy 
1236eda14cbcSMatt Macy 	/* Generate using the new math implementation */
12377877fdebSMatt Macy 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1238eda14cbcSMatt Macy 		return;
1239eda14cbcSMatt Macy 
12407877fdebSMatt Macy 	switch (rr->rr_firstdatacol) {
1241eda14cbcSMatt Macy 	case 1:
12427877fdebSMatt Macy 		vdev_raidz_generate_parity_p(rr);
1243eda14cbcSMatt Macy 		break;
1244eda14cbcSMatt Macy 	case 2:
12457877fdebSMatt Macy 		vdev_raidz_generate_parity_pq(rr);
1246eda14cbcSMatt Macy 		break;
1247eda14cbcSMatt Macy 	case 3:
12487877fdebSMatt Macy 		vdev_raidz_generate_parity_pqr(rr);
1249eda14cbcSMatt Macy 		break;
1250eda14cbcSMatt Macy 	default:
1251eda14cbcSMatt Macy 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1252eda14cbcSMatt Macy 	}
1253eda14cbcSMatt Macy }
1254eda14cbcSMatt Macy 
12557877fdebSMatt Macy void
12567877fdebSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm)
12577877fdebSMatt Macy {
12587877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++) {
12597877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[i];
12607877fdebSMatt Macy 		vdev_raidz_generate_parity_row(rm, rr);
12617877fdebSMatt Macy 	}
12627877fdebSMatt Macy }
12637877fdebSMatt Macy 
/*
 * abd_iterate_func2() callback used while rebuilding a column from P: XORs
 * each 64-bit word of the source chunk into the matching word of the
 * destination chunk.
 *
 * dbuf    - destination chunk, updated in place
 * sbuf    - source chunk, only read
 * size    - length of both chunks in bytes; only whole 64-bit words are
 *           processed, any trailing partial word is left untouched
 * private - unused
 *
 * Always returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	const uint64_t *src = sbuf;
	/* Keep the word count in size_t; it is derived from a size_t length. */
	size_t cnt = size / sizeof (src[0]);

	for (size_t i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}
1278eda14cbcSMatt Macy 
/*
 * abd_iterate_func2() callback for the accumulation pass of Q
 * reconstruction.  For every 64-bit word, the destination word is run
 * through VDEV_RAIDZ_64MUL_2() and then XORed with the source word.
 *
 * dbuf    - destination chunk, updated in place
 * sbuf    - source chunk, only read
 * size    - length of both chunks in bytes; only whole 64-bit words are
 *           processed
 * private - unused
 *
 * Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	const uint64_t *src = sbuf;
	uint64_t mask;
	/* Keep the word count in size_t; it is derived from a size_t length. */
	size_t cnt = size / sizeof (dst[0]);

	for (size_t i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}
1296eda14cbcSMatt Macy 
/*
 * abd_iterate_func() callback for the part of the destination that extends
 * past the end of a shorter source column during Q reconstruction.  It
 * applies the same VDEV_RAIDZ_64MUL_2() step as
 * vdev_raidz_reconst_q_pre_func(), with no source word to XOR in.
 *
 * buf     - destination chunk, updated in place
 * size    - length of the chunk in bytes; only whole 64-bit words are
 *           processed
 * private - unused
 *
 * Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = buf;
	uint64_t mask;
	/* Keep the word count in size_t; it is derived from a size_t length. */
	size_t cnt = size / sizeof (dst[0]);

	for (size_t i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}
1312eda14cbcSMatt Macy 
/*
 * State handed to vdev_raidz_reconst_q_post_func() by
 * vdev_raidz_reconstruct_q().
 */
struct reconst_q_struct {
	uint64_t *q;	/* next word of the Q parity buffer; advanced by the callback */
	int exp;	/* exponent passed to vdev_raidz_exp2() for every byte */
};
1317eda14cbcSMatt Macy 
1318eda14cbcSMatt Macy static int
1319eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1320eda14cbcSMatt Macy {
1321eda14cbcSMatt Macy 	struct reconst_q_struct *rq = private;
1322eda14cbcSMatt Macy 	uint64_t *dst = buf;
1323eda14cbcSMatt Macy 	int cnt = size / sizeof (dst[0]);
1324eda14cbcSMatt Macy 
1325eda14cbcSMatt Macy 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1326eda14cbcSMatt Macy 		int j;
1327eda14cbcSMatt Macy 		uint8_t *b;
1328eda14cbcSMatt Macy 
1329eda14cbcSMatt Macy 		*dst ^= *rq->q;
1330eda14cbcSMatt Macy 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1331eda14cbcSMatt Macy 			*b = vdev_raidz_exp2(*b, rq->exp);
1332eda14cbcSMatt Macy 		}
1333eda14cbcSMatt Macy 	}
1334eda14cbcSMatt Macy 
1335eda14cbcSMatt Macy 	return (0);
1336eda14cbcSMatt Macy }
1337eda14cbcSMatt Macy 
/*
 * State handed to vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func() by vdev_raidz_reconstruct_pq().  The
 * four pointers are byte cursors that the callbacks advance in lock step.
 */
struct reconst_pq_struct {
	uint8_t *p;	/* saved (real) P parity */
	uint8_t *q;	/* saved (real) Q parity */
	uint8_t *pxy;	/* P regenerated with columns x and y sized to zero */
	uint8_t *qxy;	/* Q regenerated with columns x and y sized to zero */
	int aexp;	/* exponent applied to (P ^ Pxy) via vdev_raidz_exp2() */
	int bexp;	/* exponent applied to (Q ^ Qxy) via vdev_raidz_exp2() */
};
1346eda14cbcSMatt Macy 
1347eda14cbcSMatt Macy static int
1348eda14cbcSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1349eda14cbcSMatt Macy {
1350eda14cbcSMatt Macy 	struct reconst_pq_struct *rpq = private;
1351eda14cbcSMatt Macy 	uint8_t *xd = xbuf;
1352eda14cbcSMatt Macy 	uint8_t *yd = ybuf;
1353eda14cbcSMatt Macy 
1354eda14cbcSMatt Macy 	for (int i = 0; i < size;
1355eda14cbcSMatt Macy 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1356eda14cbcSMatt Macy 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1357eda14cbcSMatt Macy 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1358eda14cbcSMatt Macy 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
1359eda14cbcSMatt Macy 	}
1360eda14cbcSMatt Macy 
1361eda14cbcSMatt Macy 	return (0);
1362eda14cbcSMatt Macy }
1363eda14cbcSMatt Macy 
1364eda14cbcSMatt Macy static int
1365eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1366eda14cbcSMatt Macy {
1367eda14cbcSMatt Macy 	struct reconst_pq_struct *rpq = private;
1368eda14cbcSMatt Macy 	uint8_t *xd = xbuf;
1369eda14cbcSMatt Macy 
1370eda14cbcSMatt Macy 	for (int i = 0; i < size;
1371eda14cbcSMatt Macy 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1372eda14cbcSMatt Macy 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
1373eda14cbcSMatt Macy 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1374eda14cbcSMatt Macy 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1375eda14cbcSMatt Macy 	}
1376eda14cbcSMatt Macy 
1377eda14cbcSMatt Macy 	return (0);
1378eda14cbcSMatt Macy }
1379eda14cbcSMatt Macy 
1380f9693befSMartin Matuska static void
13817877fdebSMatt Macy vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1382eda14cbcSMatt Macy {
1383eda14cbcSMatt Macy 	int x = tgts[0];
1384eda14cbcSMatt Macy 	abd_t *dst, *src;
1385eda14cbcSMatt Macy 
1386e716630dSMartin Matuska 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1387e716630dSMartin Matuska 		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1388e716630dSMartin Matuska 
13897877fdebSMatt Macy 	ASSERT3U(ntgts, ==, 1);
13907877fdebSMatt Macy 	ASSERT3U(x, >=, rr->rr_firstdatacol);
13917877fdebSMatt Macy 	ASSERT3U(x, <, rr->rr_cols);
1392eda14cbcSMatt Macy 
13937877fdebSMatt Macy 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1394eda14cbcSMatt Macy 
13957877fdebSMatt Macy 	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
13967877fdebSMatt Macy 	dst = rr->rr_col[x].rc_abd;
1397eda14cbcSMatt Macy 
13987877fdebSMatt Macy 	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1399eda14cbcSMatt Macy 
14007877fdebSMatt Macy 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
14017877fdebSMatt Macy 		uint64_t size = MIN(rr->rr_col[x].rc_size,
14027877fdebSMatt Macy 		    rr->rr_col[c].rc_size);
1403eda14cbcSMatt Macy 
14047877fdebSMatt Macy 		src = rr->rr_col[c].rc_abd;
1405eda14cbcSMatt Macy 
1406eda14cbcSMatt Macy 		if (c == x)
1407eda14cbcSMatt Macy 			continue;
1408eda14cbcSMatt Macy 
1409eda14cbcSMatt Macy 		(void) abd_iterate_func2(dst, src, 0, 0, size,
1410eda14cbcSMatt Macy 		    vdev_raidz_reconst_p_func, NULL);
1411eda14cbcSMatt Macy 	}
1412eda14cbcSMatt Macy }
1413eda14cbcSMatt Macy 
1414f9693befSMartin Matuska static void
14157877fdebSMatt Macy vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1416eda14cbcSMatt Macy {
1417eda14cbcSMatt Macy 	int x = tgts[0];
1418eda14cbcSMatt Macy 	int c, exp;
1419eda14cbcSMatt Macy 	abd_t *dst, *src;
1420eda14cbcSMatt Macy 
1421e716630dSMartin Matuska 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1422e716630dSMartin Matuska 		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1423e716630dSMartin Matuska 
1424eda14cbcSMatt Macy 	ASSERT(ntgts == 1);
1425eda14cbcSMatt Macy 
14267877fdebSMatt Macy 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1427eda14cbcSMatt Macy 
14287877fdebSMatt Macy 	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
14297877fdebSMatt Macy 		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
14307877fdebSMatt Macy 		    rr->rr_col[c].rc_size);
1431eda14cbcSMatt Macy 
14327877fdebSMatt Macy 		src = rr->rr_col[c].rc_abd;
14337877fdebSMatt Macy 		dst = rr->rr_col[x].rc_abd;
1434eda14cbcSMatt Macy 
14357877fdebSMatt Macy 		if (c == rr->rr_firstdatacol) {
1436eda14cbcSMatt Macy 			abd_copy(dst, src, size);
14377877fdebSMatt Macy 			if (rr->rr_col[x].rc_size > size) {
1438eda14cbcSMatt Macy 				abd_zero_off(dst, size,
14397877fdebSMatt Macy 				    rr->rr_col[x].rc_size - size);
14407877fdebSMatt Macy 			}
1441eda14cbcSMatt Macy 		} else {
14427877fdebSMatt Macy 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1443eda14cbcSMatt Macy 			(void) abd_iterate_func2(dst, src, 0, 0, size,
1444eda14cbcSMatt Macy 			    vdev_raidz_reconst_q_pre_func, NULL);
1445eda14cbcSMatt Macy 			(void) abd_iterate_func(dst,
14467877fdebSMatt Macy 			    size, rr->rr_col[x].rc_size - size,
1447eda14cbcSMatt Macy 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
1448eda14cbcSMatt Macy 		}
1449eda14cbcSMatt Macy 	}
1450eda14cbcSMatt Macy 
14517877fdebSMatt Macy 	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
14527877fdebSMatt Macy 	dst = rr->rr_col[x].rc_abd;
14537877fdebSMatt Macy 	exp = 255 - (rr->rr_cols - 1 - x);
1454eda14cbcSMatt Macy 
1455eda14cbcSMatt Macy 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
14567877fdebSMatt Macy 	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1457eda14cbcSMatt Macy 	    vdev_raidz_reconst_q_post_func, &rq);
1458eda14cbcSMatt Macy }
1459eda14cbcSMatt Macy 
1460f9693befSMartin Matuska static void
14617877fdebSMatt Macy vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1462eda14cbcSMatt Macy {
1463eda14cbcSMatt Macy 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1464eda14cbcSMatt Macy 	abd_t *pdata, *qdata;
1465eda14cbcSMatt Macy 	uint64_t xsize, ysize;
1466eda14cbcSMatt Macy 	int x = tgts[0];
1467eda14cbcSMatt Macy 	int y = tgts[1];
1468eda14cbcSMatt Macy 	abd_t *xd, *yd;
1469eda14cbcSMatt Macy 
1470e716630dSMartin Matuska 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1471e716630dSMartin Matuska 		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1472e716630dSMartin Matuska 
1473eda14cbcSMatt Macy 	ASSERT(ntgts == 2);
1474eda14cbcSMatt Macy 	ASSERT(x < y);
14757877fdebSMatt Macy 	ASSERT(x >= rr->rr_firstdatacol);
14767877fdebSMatt Macy 	ASSERT(y < rr->rr_cols);
1477eda14cbcSMatt Macy 
14787877fdebSMatt Macy 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1479eda14cbcSMatt Macy 
1480eda14cbcSMatt Macy 	/*
1481eda14cbcSMatt Macy 	 * Move the parity data aside -- we're going to compute parity as
1482eda14cbcSMatt Macy 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1483eda14cbcSMatt Macy 	 * reuse the parity generation mechanism without trashing the actual
1484eda14cbcSMatt Macy 	 * parity so we make those columns appear to be full of zeros by
1485eda14cbcSMatt Macy 	 * setting their lengths to zero.
1486eda14cbcSMatt Macy 	 */
14877877fdebSMatt Macy 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
14887877fdebSMatt Macy 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
14897877fdebSMatt Macy 	xsize = rr->rr_col[x].rc_size;
14907877fdebSMatt Macy 	ysize = rr->rr_col[y].rc_size;
1491eda14cbcSMatt Macy 
14927877fdebSMatt Macy 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
14937877fdebSMatt Macy 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
14947877fdebSMatt Macy 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
14957877fdebSMatt Macy 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
14967877fdebSMatt Macy 	rr->rr_col[x].rc_size = 0;
14977877fdebSMatt Macy 	rr->rr_col[y].rc_size = 0;
1498eda14cbcSMatt Macy 
14997877fdebSMatt Macy 	vdev_raidz_generate_parity_pq(rr);
1500eda14cbcSMatt Macy 
15017877fdebSMatt Macy 	rr->rr_col[x].rc_size = xsize;
15027877fdebSMatt Macy 	rr->rr_col[y].rc_size = ysize;
1503eda14cbcSMatt Macy 
1504eda14cbcSMatt Macy 	p = abd_to_buf(pdata);
1505eda14cbcSMatt Macy 	q = abd_to_buf(qdata);
15067877fdebSMatt Macy 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
15077877fdebSMatt Macy 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
15087877fdebSMatt Macy 	xd = rr->rr_col[x].rc_abd;
15097877fdebSMatt Macy 	yd = rr->rr_col[y].rc_abd;
1510eda14cbcSMatt Macy 
1511eda14cbcSMatt Macy 	/*
1512eda14cbcSMatt Macy 	 * We now have:
1513eda14cbcSMatt Macy 	 *	Pxy = P + D_x + D_y
1514eda14cbcSMatt Macy 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1515eda14cbcSMatt Macy 	 *
1516eda14cbcSMatt Macy 	 * We can then solve for D_x:
1517eda14cbcSMatt Macy 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
1518eda14cbcSMatt Macy 	 * where
1519eda14cbcSMatt Macy 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
1520eda14cbcSMatt Macy 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1521eda14cbcSMatt Macy 	 *
1522eda14cbcSMatt Macy 	 * With D_x in hand, we can easily solve for D_y:
1523eda14cbcSMatt Macy 	 *	D_y = P + Pxy + D_x
1524eda14cbcSMatt Macy 	 */
1525eda14cbcSMatt Macy 
1526eda14cbcSMatt Macy 	a = vdev_raidz_pow2[255 + x - y];
15277877fdebSMatt Macy 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1528eda14cbcSMatt Macy 	tmp = 255 - vdev_raidz_log2[a ^ 1];
1529eda14cbcSMatt Macy 
1530eda14cbcSMatt Macy 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1531eda14cbcSMatt Macy 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1532eda14cbcSMatt Macy 
1533eda14cbcSMatt Macy 	ASSERT3U(xsize, >=, ysize);
1534eda14cbcSMatt Macy 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1535eda14cbcSMatt Macy 
1536eda14cbcSMatt Macy 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1537eda14cbcSMatt Macy 	    vdev_raidz_reconst_pq_func, &rpq);
1538eda14cbcSMatt Macy 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
1539eda14cbcSMatt Macy 	    vdev_raidz_reconst_pq_tail_func, &rpq);
1540eda14cbcSMatt Macy 
15417877fdebSMatt Macy 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
15427877fdebSMatt Macy 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1543eda14cbcSMatt Macy 
1544eda14cbcSMatt Macy 	/*
1545eda14cbcSMatt Macy 	 * Restore the saved parity data.
1546eda14cbcSMatt Macy 	 */
15477877fdebSMatt Macy 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
15487877fdebSMatt Macy 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1549eda14cbcSMatt Macy }
1550eda14cbcSMatt Macy 
1551eda14cbcSMatt Macy /*
1552eda14cbcSMatt Macy  * In the general case of reconstruction, we must solve the system of linear
1553eda14cbcSMatt Macy  * equations defined by the coefficients used to generate parity as well as
1554eda14cbcSMatt Macy  * the contents of the data and parity disks. This can be expressed with
1555eda14cbcSMatt Macy  * vectors for the original data (D) and the actual data (d) and parity (p)
1556eda14cbcSMatt Macy  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1557eda14cbcSMatt Macy  *
1558eda14cbcSMatt Macy  *            __   __                     __     __
1559eda14cbcSMatt Macy  *            |     |         __     __   |  p_0  |
1560eda14cbcSMatt Macy  *            |  V  |         |  D_0  |   | p_m-1 |
1561eda14cbcSMatt Macy  *            |     |    x    |   :   | = |  d_0  |
1562eda14cbcSMatt Macy  *            |  I  |         | D_n-1 |   |   :   |
1563eda14cbcSMatt Macy  *            |     |         ~~     ~~   | d_n-1 |
1564eda14cbcSMatt Macy  *            ~~   ~~                     ~~     ~~
1565eda14cbcSMatt Macy  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
1570eda14cbcSMatt Macy  *
1571eda14cbcSMatt Macy  *      __               __               __     __
1572eda14cbcSMatt Macy  *      |   1   ..  1 1 1 |               |  p_0  |
1573eda14cbcSMatt Macy  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1574eda14cbcSMatt Macy  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1575eda14cbcSMatt Macy  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1576eda14cbcSMatt Macy  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1577eda14cbcSMatt Macy  *      |   :       : : : |   |   :   |   |  d_2  |
1578eda14cbcSMatt Macy  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1579eda14cbcSMatt Macy  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1580eda14cbcSMatt Macy  *      |   0   ..  0 0 1 |               | d_n-1 |
1581eda14cbcSMatt Macy  *      ~~               ~~               ~~     ~~
1582eda14cbcSMatt Macy  *
1583eda14cbcSMatt Macy  * Note that I, V, d, and p are known. To compute D, we must invert the
1584eda14cbcSMatt Macy  * matrix and use the known data and parity values to reconstruct the unknown
1585eda14cbcSMatt Macy  * data values. We begin by removing the rows in V|I and d|p that correspond
1586eda14cbcSMatt Macy  * to failed or missing columns; we then make V|I square (n x n) and d|p
1587eda14cbcSMatt Macy  * sized n by removing rows corresponding to unused parity from the bottom up
1588eda14cbcSMatt Macy  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1589eda14cbcSMatt Macy  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1590eda14cbcSMatt Macy  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1591eda14cbcSMatt Macy  *           __                               __
1592eda14cbcSMatt Macy  *           |  1   1   1   1   1   1   1   1  |
1593eda14cbcSMatt Macy  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1594eda14cbcSMatt Macy  *           |  19 205 116  29  64  16  4   1  |      / /
1595eda14cbcSMatt Macy  *           |  1   0   0   0   0   0   0   0  |     / /
1596eda14cbcSMatt Macy  *           |  0   1   0   0   0   0   0   0  | <--' /
1597eda14cbcSMatt Macy  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1598eda14cbcSMatt Macy  *           |  0   0   0   1   0   0   0   0  |
1599eda14cbcSMatt Macy  *           |  0   0   0   0   1   0   0   0  |
1600eda14cbcSMatt Macy  *           |  0   0   0   0   0   1   0   0  |
1601eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   1   0  |
1602eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   0   1  |
1603eda14cbcSMatt Macy  *           ~~                               ~~
1604eda14cbcSMatt Macy  *           __                               __
1605eda14cbcSMatt Macy  *           |  1   1   1   1   1   1   1   1  |
1606eda14cbcSMatt Macy  *           | 128  64  32  16  8   4   2   1  |
1607eda14cbcSMatt Macy  *           |  19 205 116  29  64  16  4   1  |
1608eda14cbcSMatt Macy  *           |  1   0   0   0   0   0   0   0  |
1609eda14cbcSMatt Macy  *           |  0   1   0   0   0   0   0   0  |
1610eda14cbcSMatt Macy  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
1611eda14cbcSMatt Macy  *           |  0   0   0   1   0   0   0   0  |
1612eda14cbcSMatt Macy  *           |  0   0   0   0   1   0   0   0  |
1613eda14cbcSMatt Macy  *           |  0   0   0   0   0   1   0   0  |
1614eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   1   0  |
1615eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   0   1  |
1616eda14cbcSMatt Macy  *           ~~                               ~~
1617eda14cbcSMatt Macy  *
1618eda14cbcSMatt Macy  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1619eda14cbcSMatt Macy  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1620eda14cbcSMatt Macy  * matrix is not singular.
1621eda14cbcSMatt Macy  * __                                                                 __
1622eda14cbcSMatt Macy  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1623eda14cbcSMatt Macy  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1624eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1625eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1626eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1627eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1628eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1629eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1630eda14cbcSMatt Macy  * ~~                                                                 ~~
1631eda14cbcSMatt Macy  * __                                                                 __
1632eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1633eda14cbcSMatt Macy  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1634eda14cbcSMatt Macy  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1635eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1636eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1637eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1638eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1639eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1640eda14cbcSMatt Macy  * ~~                                                                 ~~
1641eda14cbcSMatt Macy  * __                                                                 __
1642eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1643eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1644eda14cbcSMatt Macy  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1645eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1646eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1647eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1648eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1649eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1650eda14cbcSMatt Macy  * ~~                                                                 ~~
1651eda14cbcSMatt Macy  * __                                                                 __
1652eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1653eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1654eda14cbcSMatt Macy  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1655eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1656eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1657eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1658eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1659eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1660eda14cbcSMatt Macy  * ~~                                                                 ~~
1661eda14cbcSMatt Macy  * __                                                                 __
1662eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1663eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1664eda14cbcSMatt Macy  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1665eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1666eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1667eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1668eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1669eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1670eda14cbcSMatt Macy  * ~~                                                                 ~~
1671eda14cbcSMatt Macy  * __                                                                 __
1672eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1673eda14cbcSMatt Macy  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1674eda14cbcSMatt Macy  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1675eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1676eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1677eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1678eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1679eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1680eda14cbcSMatt Macy  * ~~                                                                 ~~
1681eda14cbcSMatt Macy  *                   __                               __
1682eda14cbcSMatt Macy  *                   |  0   0   1   0   0   0   0   0  |
1683eda14cbcSMatt Macy  *                   | 167 100  5   41 159 169 217 208 |
1684eda14cbcSMatt Macy  *                   | 166 100  4   40 158 168 216 209 |
1685eda14cbcSMatt Macy  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1686eda14cbcSMatt Macy  *                   |  0   0   0   0   1   0   0   0  |
1687eda14cbcSMatt Macy  *                   |  0   0   0   0   0   1   0   0  |
1688eda14cbcSMatt Macy  *                   |  0   0   0   0   0   0   1   0  |
1689eda14cbcSMatt Macy  *                   |  0   0   0   0   0   0   0   1  |
1690eda14cbcSMatt Macy  *                   ~~                               ~~
1691eda14cbcSMatt Macy  *
1692eda14cbcSMatt Macy  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1693eda14cbcSMatt Macy  * of the missing data.
1694eda14cbcSMatt Macy  *
1695eda14cbcSMatt Macy  * As is apparent from the example above, the only non-trivial rows in the
1696eda14cbcSMatt Macy  * inverse matrix correspond to the data disks that we're trying to
1697eda14cbcSMatt Macy  * reconstruct. Indeed, those are the only rows we need as the others would
1698eda14cbcSMatt Macy  * only be useful for reconstructing data known or assumed to be valid. For
1699eda14cbcSMatt Macy  * that reason, we only build the coefficients in the rows that correspond to
1700eda14cbcSMatt Macy  * targeted columns.
1701eda14cbcSMatt Macy  */
1702eda14cbcSMatt Macy 
1703eda14cbcSMatt Macy static void
17047877fdebSMatt Macy vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1705eda14cbcSMatt Macy     uint8_t **rows)
1706eda14cbcSMatt Macy {
1707eda14cbcSMatt Macy 	int i, j;
1708eda14cbcSMatt Macy 	int pow;
1709eda14cbcSMatt Macy 
17107877fdebSMatt Macy 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1711eda14cbcSMatt Macy 
1712eda14cbcSMatt Macy 	/*
1713eda14cbcSMatt Macy 	 * Fill in the missing rows of interest.
1714eda14cbcSMatt Macy 	 */
1715eda14cbcSMatt Macy 	for (i = 0; i < nmap; i++) {
1716eda14cbcSMatt Macy 		ASSERT3S(0, <=, map[i]);
1717eda14cbcSMatt Macy 		ASSERT3S(map[i], <=, 2);
1718eda14cbcSMatt Macy 
1719eda14cbcSMatt Macy 		pow = map[i] * n;
1720eda14cbcSMatt Macy 		if (pow > 255)
1721eda14cbcSMatt Macy 			pow -= 255;
1722eda14cbcSMatt Macy 		ASSERT(pow <= 255);
1723eda14cbcSMatt Macy 
1724eda14cbcSMatt Macy 		for (j = 0; j < n; j++) {
1725eda14cbcSMatt Macy 			pow -= map[i];
1726eda14cbcSMatt Macy 			if (pow < 0)
1727eda14cbcSMatt Macy 				pow += 255;
1728eda14cbcSMatt Macy 			rows[i][j] = vdev_raidz_pow2[pow];
1729eda14cbcSMatt Macy 		}
1730eda14cbcSMatt Macy 	}
1731eda14cbcSMatt Macy }
1732eda14cbcSMatt Macy 
1733eda14cbcSMatt Macy static void
17347877fdebSMatt Macy vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1735eda14cbcSMatt Macy     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1736eda14cbcSMatt Macy {
1737eda14cbcSMatt Macy 	int i, j, ii, jj;
1738eda14cbcSMatt Macy 	uint8_t log;
1739eda14cbcSMatt Macy 
1740eda14cbcSMatt Macy 	/*
1741eda14cbcSMatt Macy 	 * Assert that the first nmissing entries from the array of used
1742eda14cbcSMatt Macy 	 * columns correspond to parity columns and that subsequent entries
1743eda14cbcSMatt Macy 	 * correspond to data columns.
1744eda14cbcSMatt Macy 	 */
1745eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
17467877fdebSMatt Macy 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
1747eda14cbcSMatt Macy 	}
1748eda14cbcSMatt Macy 	for (; i < n; i++) {
17497877fdebSMatt Macy 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1750eda14cbcSMatt Macy 	}
1751eda14cbcSMatt Macy 
1752eda14cbcSMatt Macy 	/*
1753eda14cbcSMatt Macy 	 * First initialize the storage where we'll compute the inverse rows.
1754eda14cbcSMatt Macy 	 */
1755eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
1756eda14cbcSMatt Macy 		for (j = 0; j < n; j++) {
1757eda14cbcSMatt Macy 			invrows[i][j] = (i == j) ? 1 : 0;
1758eda14cbcSMatt Macy 		}
1759eda14cbcSMatt Macy 	}
1760eda14cbcSMatt Macy 
1761eda14cbcSMatt Macy 	/*
1762eda14cbcSMatt Macy 	 * Subtract all trivial rows from the rows of consequence.
1763eda14cbcSMatt Macy 	 */
1764eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
1765eda14cbcSMatt Macy 		for (j = nmissing; j < n; j++) {
17667877fdebSMatt Macy 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
17677877fdebSMatt Macy 			jj = used[j] - rr->rr_firstdatacol;
1768eda14cbcSMatt Macy 			ASSERT3S(jj, <, n);
1769eda14cbcSMatt Macy 			invrows[i][j] = rows[i][jj];
1770eda14cbcSMatt Macy 			rows[i][jj] = 0;
1771eda14cbcSMatt Macy 		}
1772eda14cbcSMatt Macy 	}
1773eda14cbcSMatt Macy 
1774eda14cbcSMatt Macy 	/*
1775eda14cbcSMatt Macy 	 * For each of the rows of interest, we must normalize it and subtract
1776eda14cbcSMatt Macy 	 * a multiple of it from the other rows.
1777eda14cbcSMatt Macy 	 */
1778eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
1779eda14cbcSMatt Macy 		for (j = 0; j < missing[i]; j++) {
1780eda14cbcSMatt Macy 			ASSERT0(rows[i][j]);
1781eda14cbcSMatt Macy 		}
1782eda14cbcSMatt Macy 		ASSERT3U(rows[i][missing[i]], !=, 0);
1783eda14cbcSMatt Macy 
1784eda14cbcSMatt Macy 		/*
1785eda14cbcSMatt Macy 		 * Compute the inverse of the first element and multiply each
1786eda14cbcSMatt Macy 		 * element in the row by that value.
1787eda14cbcSMatt Macy 		 */
1788eda14cbcSMatt Macy 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1789eda14cbcSMatt Macy 
1790eda14cbcSMatt Macy 		for (j = 0; j < n; j++) {
1791eda14cbcSMatt Macy 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1792eda14cbcSMatt Macy 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1793eda14cbcSMatt Macy 		}
1794eda14cbcSMatt Macy 
1795eda14cbcSMatt Macy 		for (ii = 0; ii < nmissing; ii++) {
1796eda14cbcSMatt Macy 			if (i == ii)
1797eda14cbcSMatt Macy 				continue;
1798eda14cbcSMatt Macy 
1799eda14cbcSMatt Macy 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1800eda14cbcSMatt Macy 
1801eda14cbcSMatt Macy 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1802eda14cbcSMatt Macy 
1803eda14cbcSMatt Macy 			for (j = 0; j < n; j++) {
1804eda14cbcSMatt Macy 				rows[ii][j] ^=
1805eda14cbcSMatt Macy 				    vdev_raidz_exp2(rows[i][j], log);
1806eda14cbcSMatt Macy 				invrows[ii][j] ^=
1807eda14cbcSMatt Macy 				    vdev_raidz_exp2(invrows[i][j], log);
1808eda14cbcSMatt Macy 			}
1809eda14cbcSMatt Macy 		}
1810eda14cbcSMatt Macy 	}
1811eda14cbcSMatt Macy 
1812eda14cbcSMatt Macy 	/*
1813eda14cbcSMatt Macy 	 * Verify that the data that is left in the rows are properly part of
1814eda14cbcSMatt Macy 	 * an identity matrix.
1815eda14cbcSMatt Macy 	 */
1816eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
1817eda14cbcSMatt Macy 		for (j = 0; j < n; j++) {
1818eda14cbcSMatt Macy 			if (j == missing[i]) {
1819eda14cbcSMatt Macy 				ASSERT3U(rows[i][j], ==, 1);
1820eda14cbcSMatt Macy 			} else {
1821eda14cbcSMatt Macy 				ASSERT0(rows[i][j]);
1822eda14cbcSMatt Macy 			}
1823eda14cbcSMatt Macy 		}
1824eda14cbcSMatt Macy 	}
1825eda14cbcSMatt Macy }
1826eda14cbcSMatt Macy 
1827eda14cbcSMatt Macy static void
18287877fdebSMatt Macy vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1829eda14cbcSMatt Macy     int *missing, uint8_t **invrows, const uint8_t *used)
1830eda14cbcSMatt Macy {
1831eda14cbcSMatt Macy 	int i, j, x, cc, c;
1832eda14cbcSMatt Macy 	uint8_t *src;
1833eda14cbcSMatt Macy 	uint64_t ccount;
1834eda14cbcSMatt Macy 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1835eda14cbcSMatt Macy 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1836eda14cbcSMatt Macy 	uint8_t log = 0;
1837eda14cbcSMatt Macy 	uint8_t val;
1838eda14cbcSMatt Macy 	int ll;
1839eda14cbcSMatt Macy 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1840eda14cbcSMatt Macy 	uint8_t *p, *pp;
1841eda14cbcSMatt Macy 	size_t psize;
1842eda14cbcSMatt Macy 
1843eda14cbcSMatt Macy 	psize = sizeof (invlog[0][0]) * n * nmissing;
1844eda14cbcSMatt Macy 	p = kmem_alloc(psize, KM_SLEEP);
1845eda14cbcSMatt Macy 
1846eda14cbcSMatt Macy 	for (pp = p, i = 0; i < nmissing; i++) {
1847eda14cbcSMatt Macy 		invlog[i] = pp;
1848eda14cbcSMatt Macy 		pp += n;
1849eda14cbcSMatt Macy 	}
1850eda14cbcSMatt Macy 
1851eda14cbcSMatt Macy 	for (i = 0; i < nmissing; i++) {
1852eda14cbcSMatt Macy 		for (j = 0; j < n; j++) {
1853eda14cbcSMatt Macy 			ASSERT3U(invrows[i][j], !=, 0);
1854eda14cbcSMatt Macy 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1855eda14cbcSMatt Macy 		}
1856eda14cbcSMatt Macy 	}
1857eda14cbcSMatt Macy 
1858eda14cbcSMatt Macy 	for (i = 0; i < n; i++) {
1859eda14cbcSMatt Macy 		c = used[i];
18607877fdebSMatt Macy 		ASSERT3U(c, <, rr->rr_cols);
1861eda14cbcSMatt Macy 
18627877fdebSMatt Macy 		ccount = rr->rr_col[c].rc_size;
18637877fdebSMatt Macy 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
18647877fdebSMatt Macy 		if (ccount == 0)
18657877fdebSMatt Macy 			continue;
18667877fdebSMatt Macy 		src = abd_to_buf(rr->rr_col[c].rc_abd);
1867eda14cbcSMatt Macy 		for (j = 0; j < nmissing; j++) {
18687877fdebSMatt Macy 			cc = missing[j] + rr->rr_firstdatacol;
18697877fdebSMatt Macy 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
18707877fdebSMatt Macy 			ASSERT3U(cc, <, rr->rr_cols);
1871eda14cbcSMatt Macy 			ASSERT3U(cc, !=, c);
1872eda14cbcSMatt Macy 
18737877fdebSMatt Macy 			dcount[j] = rr->rr_col[cc].rc_size;
18747877fdebSMatt Macy 			if (dcount[j] != 0)
18757877fdebSMatt Macy 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1876eda14cbcSMatt Macy 		}
1877eda14cbcSMatt Macy 
1878eda14cbcSMatt Macy 		for (x = 0; x < ccount; x++, src++) {
1879eda14cbcSMatt Macy 			if (*src != 0)
1880eda14cbcSMatt Macy 				log = vdev_raidz_log2[*src];
1881eda14cbcSMatt Macy 
1882eda14cbcSMatt Macy 			for (cc = 0; cc < nmissing; cc++) {
1883eda14cbcSMatt Macy 				if (x >= dcount[cc])
1884eda14cbcSMatt Macy 					continue;
1885eda14cbcSMatt Macy 
1886eda14cbcSMatt Macy 				if (*src == 0) {
1887eda14cbcSMatt Macy 					val = 0;
1888eda14cbcSMatt Macy 				} else {
1889eda14cbcSMatt Macy 					if ((ll = log + invlog[cc][i]) >= 255)
1890eda14cbcSMatt Macy 						ll -= 255;
1891eda14cbcSMatt Macy 					val = vdev_raidz_pow2[ll];
1892eda14cbcSMatt Macy 				}
1893eda14cbcSMatt Macy 
1894eda14cbcSMatt Macy 				if (i == 0)
1895eda14cbcSMatt Macy 					dst[cc][x] = val;
1896eda14cbcSMatt Macy 				else
1897eda14cbcSMatt Macy 					dst[cc][x] ^= val;
1898eda14cbcSMatt Macy 			}
1899eda14cbcSMatt Macy 		}
1900eda14cbcSMatt Macy 	}
1901eda14cbcSMatt Macy 
1902eda14cbcSMatt Macy 	kmem_free(p, psize);
1903eda14cbcSMatt Macy }
1904eda14cbcSMatt Macy 
1905f9693befSMartin Matuska static void
19067877fdebSMatt Macy vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1907eda14cbcSMatt Macy {
1908b985c9caSMartin Matuska 	int i, c, t, tt;
1909b985c9caSMartin Matuska 	unsigned int n;
1910b985c9caSMartin Matuska 	unsigned int nmissing_rows;
1911eda14cbcSMatt Macy 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1912eda14cbcSMatt Macy 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1913eda14cbcSMatt Macy 	uint8_t *p, *pp;
1914eda14cbcSMatt Macy 	size_t psize;
1915eda14cbcSMatt Macy 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1916eda14cbcSMatt Macy 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1917eda14cbcSMatt Macy 	uint8_t *used;
1918eda14cbcSMatt Macy 
1919eda14cbcSMatt Macy 	abd_t **bufs = NULL;
1920eda14cbcSMatt Macy 
1921e716630dSMartin Matuska 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1922e716630dSMartin Matuska 		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1923eda14cbcSMatt Macy 	/*
1924eda14cbcSMatt Macy 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
19257877fdebSMatt Macy 	 * temporary linear ABDs if any non-linear ABDs are found.
1926eda14cbcSMatt Macy 	 */
19277877fdebSMatt Macy 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1928e716630dSMartin Matuska 		ASSERT(rr->rr_col[i].rc_abd != NULL);
19297877fdebSMatt Macy 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
19307877fdebSMatt Macy 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
19317877fdebSMatt Macy 			    KM_PUSHPAGE);
1932eda14cbcSMatt Macy 
19337877fdebSMatt Macy 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
19347877fdebSMatt Macy 				raidz_col_t *col = &rr->rr_col[c];
1935eda14cbcSMatt Macy 
1936eda14cbcSMatt Macy 				bufs[c] = col->rc_abd;
19377877fdebSMatt Macy 				if (bufs[c] != NULL) {
19387877fdebSMatt Macy 					col->rc_abd = abd_alloc_linear(
19397877fdebSMatt Macy 					    col->rc_size, B_TRUE);
19407877fdebSMatt Macy 					abd_copy(col->rc_abd, bufs[c],
19417877fdebSMatt Macy 					    col->rc_size);
1942eda14cbcSMatt Macy 				}
1943eda14cbcSMatt Macy 			}
1944eda14cbcSMatt Macy 
19457877fdebSMatt Macy 			break;
19467877fdebSMatt Macy 		}
19477877fdebSMatt Macy 	}
19487877fdebSMatt Macy 
19497877fdebSMatt Macy 	n = rr->rr_cols - rr->rr_firstdatacol;
1950eda14cbcSMatt Macy 
1951eda14cbcSMatt Macy 	/*
1952eda14cbcSMatt Macy 	 * Figure out which data columns are missing.
1953eda14cbcSMatt Macy 	 */
1954eda14cbcSMatt Macy 	nmissing_rows = 0;
1955eda14cbcSMatt Macy 	for (t = 0; t < ntgts; t++) {
19567877fdebSMatt Macy 		if (tgts[t] >= rr->rr_firstdatacol) {
1957eda14cbcSMatt Macy 			missing_rows[nmissing_rows++] =
19587877fdebSMatt Macy 			    tgts[t] - rr->rr_firstdatacol;
1959eda14cbcSMatt Macy 		}
1960eda14cbcSMatt Macy 	}
1961eda14cbcSMatt Macy 
1962eda14cbcSMatt Macy 	/*
1963eda14cbcSMatt Macy 	 * Figure out which parity columns to use to help generate the missing
1964eda14cbcSMatt Macy 	 * data columns.
1965eda14cbcSMatt Macy 	 */
1966eda14cbcSMatt Macy 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1967eda14cbcSMatt Macy 		ASSERT(tt < ntgts);
19687877fdebSMatt Macy 		ASSERT(c < rr->rr_firstdatacol);
1969eda14cbcSMatt Macy 
1970eda14cbcSMatt Macy 		/*
1971eda14cbcSMatt Macy 		 * Skip any targeted parity columns.
1972eda14cbcSMatt Macy 		 */
1973eda14cbcSMatt Macy 		if (c == tgts[tt]) {
1974eda14cbcSMatt Macy 			tt++;
1975eda14cbcSMatt Macy 			continue;
1976eda14cbcSMatt Macy 		}
1977eda14cbcSMatt Macy 
1978eda14cbcSMatt Macy 		parity_map[i] = c;
1979eda14cbcSMatt Macy 		i++;
1980eda14cbcSMatt Macy 	}
1981eda14cbcSMatt Macy 
1982eda14cbcSMatt Macy 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1983eda14cbcSMatt Macy 	    nmissing_rows * n + sizeof (used[0]) * n;
1984eda14cbcSMatt Macy 	p = kmem_alloc(psize, KM_SLEEP);
1985eda14cbcSMatt Macy 
1986eda14cbcSMatt Macy 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1987eda14cbcSMatt Macy 		rows[i] = pp;
1988eda14cbcSMatt Macy 		pp += n;
1989eda14cbcSMatt Macy 		invrows[i] = pp;
1990eda14cbcSMatt Macy 		pp += n;
1991eda14cbcSMatt Macy 	}
1992eda14cbcSMatt Macy 	used = pp;
1993eda14cbcSMatt Macy 
1994eda14cbcSMatt Macy 	for (i = 0; i < nmissing_rows; i++) {
1995eda14cbcSMatt Macy 		used[i] = parity_map[i];
1996eda14cbcSMatt Macy 	}
1997eda14cbcSMatt Macy 
19987877fdebSMatt Macy 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1999eda14cbcSMatt Macy 		if (tt < nmissing_rows &&
20007877fdebSMatt Macy 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
2001eda14cbcSMatt Macy 			tt++;
2002eda14cbcSMatt Macy 			continue;
2003eda14cbcSMatt Macy 		}
2004eda14cbcSMatt Macy 
2005eda14cbcSMatt Macy 		ASSERT3S(i, <, n);
2006eda14cbcSMatt Macy 		used[i] = c;
2007eda14cbcSMatt Macy 		i++;
2008eda14cbcSMatt Macy 	}
2009eda14cbcSMatt Macy 
2010eda14cbcSMatt Macy 	/*
2011eda14cbcSMatt Macy 	 * Initialize the interesting rows of the matrix.
2012eda14cbcSMatt Macy 	 */
20137877fdebSMatt Macy 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2014eda14cbcSMatt Macy 
2015eda14cbcSMatt Macy 	/*
2016eda14cbcSMatt Macy 	 * Invert the matrix.
2017eda14cbcSMatt Macy 	 */
20187877fdebSMatt Macy 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2019eda14cbcSMatt Macy 	    invrows, used);
2020eda14cbcSMatt Macy 
2021eda14cbcSMatt Macy 	/*
2022eda14cbcSMatt Macy 	 * Reconstruct the missing data using the generated matrix.
2023eda14cbcSMatt Macy 	 */
20247877fdebSMatt Macy 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2025eda14cbcSMatt Macy 	    invrows, used);
2026eda14cbcSMatt Macy 
2027eda14cbcSMatt Macy 	kmem_free(p, psize);
2028eda14cbcSMatt Macy 
2029eda14cbcSMatt Macy 	/*
2030eda14cbcSMatt Macy 	 * copy back from temporary linear abds and free them
2031eda14cbcSMatt Macy 	 */
2032eda14cbcSMatt Macy 	if (bufs) {
20337877fdebSMatt Macy 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
20347877fdebSMatt Macy 			raidz_col_t *col = &rr->rr_col[c];
2035eda14cbcSMatt Macy 
20367877fdebSMatt Macy 			if (bufs[c] != NULL) {
2037eda14cbcSMatt Macy 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
2038eda14cbcSMatt Macy 				abd_free(col->rc_abd);
20397877fdebSMatt Macy 			}
2040eda14cbcSMatt Macy 			col->rc_abd = bufs[c];
2041eda14cbcSMatt Macy 		}
20427877fdebSMatt Macy 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2043eda14cbcSMatt Macy 	}
2044eda14cbcSMatt Macy }
2045eda14cbcSMatt Macy 
2046f9693befSMartin Matuska static void
20477877fdebSMatt Macy vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
20487877fdebSMatt Macy     const int *t, int nt)
2049eda14cbcSMatt Macy {
2050eda14cbcSMatt Macy 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2051eda14cbcSMatt Macy 	int ntgts;
2052eda14cbcSMatt Macy 	int i, c, ret;
2053eda14cbcSMatt Macy 	int nbadparity, nbaddata;
2054eda14cbcSMatt Macy 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
2055eda14cbcSMatt Macy 
2056e716630dSMartin Matuska 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2057e716630dSMartin Matuska 		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2058e716630dSMartin Matuska 		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2059e716630dSMartin Matuska 		    (int)rr->rr_missingparity);
2060e716630dSMartin Matuska 	}
2061e716630dSMartin Matuska 
20627877fdebSMatt Macy 	nbadparity = rr->rr_firstdatacol;
20637877fdebSMatt Macy 	nbaddata = rr->rr_cols - nbadparity;
2064eda14cbcSMatt Macy 	ntgts = 0;
20657877fdebSMatt Macy 	for (i = 0, c = 0; c < rr->rr_cols; c++) {
2066e716630dSMartin Matuska 		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2067e716630dSMartin Matuska 			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2068e716630dSMartin Matuska 			    "offset=%llx error=%u)",
2069e716630dSMartin Matuska 			    rr, c, (int)rr->rr_col[c].rc_devidx,
2070e716630dSMartin Matuska 			    (long long)rr->rr_col[c].rc_offset,
2071e716630dSMartin Matuska 			    (int)rr->rr_col[c].rc_error);
2072e716630dSMartin Matuska 		}
20737877fdebSMatt Macy 		if (c < rr->rr_firstdatacol)
2074eda14cbcSMatt Macy 			parity_valid[c] = B_FALSE;
2075eda14cbcSMatt Macy 
2076eda14cbcSMatt Macy 		if (i < nt && c == t[i]) {
2077eda14cbcSMatt Macy 			tgts[ntgts++] = c;
2078eda14cbcSMatt Macy 			i++;
20797877fdebSMatt Macy 		} else if (rr->rr_col[c].rc_error != 0) {
2080eda14cbcSMatt Macy 			tgts[ntgts++] = c;
20817877fdebSMatt Macy 		} else if (c >= rr->rr_firstdatacol) {
2082eda14cbcSMatt Macy 			nbaddata--;
2083eda14cbcSMatt Macy 		} else {
2084eda14cbcSMatt Macy 			parity_valid[c] = B_TRUE;
2085eda14cbcSMatt Macy 			nbadparity--;
2086eda14cbcSMatt Macy 		}
2087eda14cbcSMatt Macy 	}
2088eda14cbcSMatt Macy 
2089eda14cbcSMatt Macy 	ASSERT(ntgts >= nt);
2090eda14cbcSMatt Macy 	ASSERT(nbaddata >= 0);
2091eda14cbcSMatt Macy 	ASSERT(nbaddata + nbadparity == ntgts);
2092eda14cbcSMatt Macy 
2093eda14cbcSMatt Macy 	dt = &tgts[nbadparity];
2094eda14cbcSMatt Macy 
2095eda14cbcSMatt Macy 	/* Reconstruct using the new math implementation */
20967877fdebSMatt Macy 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2097eda14cbcSMatt Macy 	if (ret != RAIDZ_ORIGINAL_IMPL)
2098f9693befSMartin Matuska 		return;
2099eda14cbcSMatt Macy 
2100eda14cbcSMatt Macy 	/*
2101eda14cbcSMatt Macy 	 * See if we can use any of our optimized reconstruction routines.
2102eda14cbcSMatt Macy 	 */
2103eda14cbcSMatt Macy 	switch (nbaddata) {
2104eda14cbcSMatt Macy 	case 1:
2105f9693befSMartin Matuska 		if (parity_valid[VDEV_RAIDZ_P]) {
2106f9693befSMartin Matuska 			vdev_raidz_reconstruct_p(rr, dt, 1);
2107f9693befSMartin Matuska 			return;
2108f9693befSMartin Matuska 		}
2109eda14cbcSMatt Macy 
21107877fdebSMatt Macy 		ASSERT(rr->rr_firstdatacol > 1);
2111eda14cbcSMatt Macy 
2112f9693befSMartin Matuska 		if (parity_valid[VDEV_RAIDZ_Q]) {
2113f9693befSMartin Matuska 			vdev_raidz_reconstruct_q(rr, dt, 1);
2114f9693befSMartin Matuska 			return;
2115f9693befSMartin Matuska 		}
2116eda14cbcSMatt Macy 
21177877fdebSMatt Macy 		ASSERT(rr->rr_firstdatacol > 2);
2118eda14cbcSMatt Macy 		break;
2119eda14cbcSMatt Macy 
2120eda14cbcSMatt Macy 	case 2:
21217877fdebSMatt Macy 		ASSERT(rr->rr_firstdatacol > 1);
2122eda14cbcSMatt Macy 
2123eda14cbcSMatt Macy 		if (parity_valid[VDEV_RAIDZ_P] &&
2124f9693befSMartin Matuska 		    parity_valid[VDEV_RAIDZ_Q]) {
2125f9693befSMartin Matuska 			vdev_raidz_reconstruct_pq(rr, dt, 2);
2126f9693befSMartin Matuska 			return;
2127f9693befSMartin Matuska 		}
2128eda14cbcSMatt Macy 
21297877fdebSMatt Macy 		ASSERT(rr->rr_firstdatacol > 2);
2130eda14cbcSMatt Macy 
2131eda14cbcSMatt Macy 		break;
2132eda14cbcSMatt Macy 	}
2133eda14cbcSMatt Macy 
2134f9693befSMartin Matuska 	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2135eda14cbcSMatt Macy }
2136eda14cbcSMatt Macy 
2137eda14cbcSMatt Macy static int
2138eda14cbcSMatt Macy vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2139eda14cbcSMatt Macy     uint64_t *logical_ashift, uint64_t *physical_ashift)
2140eda14cbcSMatt Macy {
21417877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
21427877fdebSMatt Macy 	uint64_t nparity = vdrz->vd_nparity;
2143eda14cbcSMatt Macy 	int c;
2144eda14cbcSMatt Macy 	int lasterror = 0;
2145eda14cbcSMatt Macy 	int numerrors = 0;
2146eda14cbcSMatt Macy 
2147eda14cbcSMatt Macy 	ASSERT(nparity > 0);
2148eda14cbcSMatt Macy 
2149eda14cbcSMatt Macy 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
2150eda14cbcSMatt Macy 	    vd->vdev_children < nparity + 1) {
2151eda14cbcSMatt Macy 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2152eda14cbcSMatt Macy 		return (SET_ERROR(EINVAL));
2153eda14cbcSMatt Macy 	}
2154eda14cbcSMatt Macy 
2155eda14cbcSMatt Macy 	vdev_open_children(vd);
2156eda14cbcSMatt Macy 
2157eda14cbcSMatt Macy 	for (c = 0; c < vd->vdev_children; c++) {
21587877fdebSMatt Macy 		vdev_t *cvd = vd->vdev_child[c];
2159eda14cbcSMatt Macy 
2160eda14cbcSMatt Macy 		if (cvd->vdev_open_error != 0) {
2161eda14cbcSMatt Macy 			lasterror = cvd->vdev_open_error;
2162eda14cbcSMatt Macy 			numerrors++;
2163eda14cbcSMatt Macy 			continue;
2164eda14cbcSMatt Macy 		}
2165eda14cbcSMatt Macy 
2166eda14cbcSMatt Macy 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2167eda14cbcSMatt Macy 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2168eda14cbcSMatt Macy 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2169c7046f76SMartin Matuska 	}
2170c7046f76SMartin Matuska 	for (c = 0; c < vd->vdev_children; c++) {
2171c7046f76SMartin Matuska 		vdev_t *cvd = vd->vdev_child[c];
2172c7046f76SMartin Matuska 
2173c7046f76SMartin Matuska 		if (cvd->vdev_open_error != 0)
2174c7046f76SMartin Matuska 			continue;
2175c7046f76SMartin Matuska 		*physical_ashift = vdev_best_ashift(*logical_ashift,
2176c7046f76SMartin Matuska 		    *physical_ashift, cvd->vdev_physical_ashift);
2177eda14cbcSMatt Macy 	}
2178eda14cbcSMatt Macy 
2179e716630dSMartin Matuska 	if (vd->vdev_rz_expanding) {
2180e716630dSMartin Matuska 		*asize *= vd->vdev_children - 1;
2181e716630dSMartin Matuska 		*max_asize *= vd->vdev_children - 1;
2182e716630dSMartin Matuska 
2183e716630dSMartin Matuska 		vd->vdev_min_asize = *asize;
2184e716630dSMartin Matuska 	} else {
2185eda14cbcSMatt Macy 		*asize *= vd->vdev_children;
2186eda14cbcSMatt Macy 		*max_asize *= vd->vdev_children;
2187e716630dSMartin Matuska 	}
2188eda14cbcSMatt Macy 
2189eda14cbcSMatt Macy 	if (numerrors > nparity) {
2190eda14cbcSMatt Macy 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2191eda14cbcSMatt Macy 		return (lasterror);
2192eda14cbcSMatt Macy 	}
2193eda14cbcSMatt Macy 
2194eda14cbcSMatt Macy 	return (0);
2195eda14cbcSMatt Macy }
2196eda14cbcSMatt Macy 
2197eda14cbcSMatt Macy static void
2198eda14cbcSMatt Macy vdev_raidz_close(vdev_t *vd)
2199eda14cbcSMatt Macy {
22007877fdebSMatt Macy 	for (int c = 0; c < vd->vdev_children; c++) {
22017877fdebSMatt Macy 		if (vd->vdev_child[c] != NULL)
2202eda14cbcSMatt Macy 			vdev_close(vd->vdev_child[c]);
2203eda14cbcSMatt Macy 	}
22047877fdebSMatt Macy }
2205eda14cbcSMatt Macy 
2206e716630dSMartin Matuska /*
2207e716630dSMartin Matuska  * Return the logical width to use, given the txg in which the allocation
2208783d3ff6SMartin Matuska  * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
2209e716630dSMartin Matuska  * BP was allocated.  Remapped BP's (that were relocated due to device
2210783d3ff6SMartin Matuska  * removal, see remap_blkptr_cb()), will have a more recent physical birth
2211783d3ff6SMartin Matuska  * which reflects when the BP was relocated, but we can ignore these because
2212783d3ff6SMartin Matuska  * they can't be on RAIDZ (device removal doesn't support RAIDZ).
2213e716630dSMartin Matuska  */
2214eda14cbcSMatt Macy static uint64_t
2215e716630dSMartin Matuska vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2216e716630dSMartin Matuska {
2217e716630dSMartin Matuska 	reflow_node_t lookup = {
2218e716630dSMartin Matuska 		.re_txg = txg,
2219e716630dSMartin Matuska 	};
2220e716630dSMartin Matuska 	avl_index_t where;
2221e716630dSMartin Matuska 
2222e716630dSMartin Matuska 	uint64_t width;
2223e716630dSMartin Matuska 	mutex_enter(&vdrz->vd_expand_lock);
2224e716630dSMartin Matuska 	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2225e716630dSMartin Matuska 	if (re != NULL) {
2226e716630dSMartin Matuska 		width = re->re_logical_width;
2227e716630dSMartin Matuska 	} else {
2228e716630dSMartin Matuska 		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2229e716630dSMartin Matuska 		if (re != NULL)
2230e716630dSMartin Matuska 			width = re->re_logical_width;
2231e716630dSMartin Matuska 		else
2232e716630dSMartin Matuska 			width = vdrz->vd_original_width;
2233e716630dSMartin Matuska 	}
2234e716630dSMartin Matuska 	mutex_exit(&vdrz->vd_expand_lock);
2235e716630dSMartin Matuska 	return (width);
2236e716630dSMartin Matuska }
2237e716630dSMartin Matuska 
2238e716630dSMartin Matuska /*
2239e716630dSMartin Matuska  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
2240e716630dSMartin Matuska  * more space due to the lower data-to-parity ratio.  In this case it's
2241e716630dSMartin Matuska  * important to pass in the correct txg.  Note that vdev_gang_header_asize()
2242e716630dSMartin Matuska  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2243e716630dSMartin Matuska  * regardless of txg.  This is assured because for a single data sector, we
2244e716630dSMartin Matuska  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2245e716630dSMartin Matuska  */
2246e716630dSMartin Matuska static uint64_t
2247e716630dSMartin Matuska vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2248eda14cbcSMatt Macy {
22497877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2250eda14cbcSMatt Macy 	uint64_t asize;
2251eda14cbcSMatt Macy 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2252e716630dSMartin Matuska 	uint64_t cols = vdrz->vd_original_width;
22537877fdebSMatt Macy 	uint64_t nparity = vdrz->vd_nparity;
2254eda14cbcSMatt Macy 
2255e716630dSMartin Matuska 	cols = vdev_raidz_get_logical_width(vdrz, txg);
2256e716630dSMartin Matuska 
2257eda14cbcSMatt Macy 	asize = ((psize - 1) >> ashift) + 1;
2258eda14cbcSMatt Macy 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2259eda14cbcSMatt Macy 	asize = roundup(asize, nparity + 1) << ashift;
2260eda14cbcSMatt Macy 
2261e716630dSMartin Matuska #ifdef ZFS_DEBUG
2262e716630dSMartin Matuska 	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2263e716630dSMartin Matuska 	uint64_t ncols_new = vdrz->vd_physical_width;
2264e716630dSMartin Matuska 	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2265e716630dSMartin Matuska 	    (ncols_new - nparity));
2266e716630dSMartin Matuska 	asize_new = roundup(asize_new, nparity + 1) << ashift;
2267e716630dSMartin Matuska 	VERIFY3U(asize_new, <=, asize);
2268e716630dSMartin Matuska #endif
2269e716630dSMartin Matuska 
2270eda14cbcSMatt Macy 	return (asize);
2271eda14cbcSMatt Macy }
2272eda14cbcSMatt Macy 
22737877fdebSMatt Macy /*
22747877fdebSMatt Macy  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
22757877fdebSMatt Macy  * so each child must provide at least 1/Nth of its asize.
22767877fdebSMatt Macy  */
22777877fdebSMatt Macy static uint64_t
22787877fdebSMatt Macy vdev_raidz_min_asize(vdev_t *vd)
22797877fdebSMatt Macy {
22807877fdebSMatt Macy 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
22817877fdebSMatt Macy 	    vd->vdev_children);
22827877fdebSMatt Macy }
22837877fdebSMatt Macy 
22847877fdebSMatt Macy void
2285eda14cbcSMatt Macy vdev_raidz_child_done(zio_t *zio)
2286eda14cbcSMatt Macy {
2287eda14cbcSMatt Macy 	raidz_col_t *rc = zio->io_private;
2288eda14cbcSMatt Macy 
228981b22a98SMartin Matuska 	ASSERT3P(rc->rc_abd, !=, NULL);
2290eda14cbcSMatt Macy 	rc->rc_error = zio->io_error;
2291eda14cbcSMatt Macy 	rc->rc_tried = 1;
2292eda14cbcSMatt Macy 	rc->rc_skipped = 0;
2293eda14cbcSMatt Macy }
2294eda14cbcSMatt Macy 
2295eda14cbcSMatt Macy static void
2296e716630dSMartin Matuska vdev_raidz_shadow_child_done(zio_t *zio)
2297eda14cbcSMatt Macy {
2298e716630dSMartin Matuska 	raidz_col_t *rc = zio->io_private;
2299eda14cbcSMatt Macy 
2300e716630dSMartin Matuska 	rc->rc_shadow_error = zio->io_error;
2301e716630dSMartin Matuska }
2302e716630dSMartin Matuska 
2303e716630dSMartin Matuska static void
2304e716630dSMartin Matuska vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2305e716630dSMartin Matuska {
2306e716630dSMartin Matuska 	(void) rm;
2307e716630dSMartin Matuska #ifdef ZFS_DEBUG
23087877fdebSMatt Macy 	range_seg64_t logical_rs, physical_rs, remain_rs;
23097877fdebSMatt Macy 	logical_rs.rs_start = rr->rr_offset;
2310eda14cbcSMatt Macy 	logical_rs.rs_end = logical_rs.rs_start +
2311e716630dSMartin Matuska 	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
2312783d3ff6SMartin Matuska 	    BP_GET_BIRTH(zio->io_bp));
2313eda14cbcSMatt Macy 
23147877fdebSMatt Macy 	raidz_col_t *rc = &rr->rr_col[col];
2315e716630dSMartin Matuska 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2316eda14cbcSMatt Macy 
23177877fdebSMatt Macy 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
23187877fdebSMatt Macy 	ASSERT(vdev_xlate_is_empty(&remain_rs));
2319e716630dSMartin Matuska 	if (vdev_xlate_is_empty(&physical_rs)) {
2320e716630dSMartin Matuska 		/*
2321e716630dSMartin Matuska 		 * If we are in the middle of expansion, the
2322e716630dSMartin Matuska 		 * physical->logical mapping is changing so vdev_xlate()
2323e716630dSMartin Matuska 		 * can't give us a reliable answer.
2324e716630dSMartin Matuska 		 */
2325e716630dSMartin Matuska 		return;
2326e716630dSMartin Matuska 	}
2327eda14cbcSMatt Macy 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2328eda14cbcSMatt Macy 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2329eda14cbcSMatt Macy 	/*
2330eda14cbcSMatt Macy 	 * It would be nice to assert that rs_end is equal
2331eda14cbcSMatt Macy 	 * to rc_offset + rc_size but there might be an
2332eda14cbcSMatt Macy 	 * optional I/O at the end that is not accounted in
2333eda14cbcSMatt Macy 	 * rc_size.
2334eda14cbcSMatt Macy 	 */
2335eda14cbcSMatt Macy 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2336eda14cbcSMatt Macy 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2337e716630dSMartin Matuska 		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2338eda14cbcSMatt Macy 	} else {
2339eda14cbcSMatt Macy 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2340eda14cbcSMatt Macy 	}
2341eda14cbcSMatt Macy #endif
2342eda14cbcSMatt Macy }
2343eda14cbcSMatt Macy 
23447877fdebSMatt Macy static void
2345e716630dSMartin Matuska vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
23467877fdebSMatt Macy {
23477877fdebSMatt Macy 	vdev_t *vd = zio->io_vd;
23487877fdebSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
23497877fdebSMatt Macy 
23507877fdebSMatt Macy 	vdev_raidz_generate_parity_row(rm, rr);
23517877fdebSMatt Macy 
235281b22a98SMartin Matuska 	for (int c = 0; c < rr->rr_scols; c++) {
23537877fdebSMatt Macy 		raidz_col_t *rc = &rr->rr_col[c];
235481b22a98SMartin Matuska 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
23557877fdebSMatt Macy 
23567877fdebSMatt Macy 		/* Verify physical to logical translation */
2357e716630dSMartin Matuska 		vdev_raidz_io_verify(zio, rm, rr, c);
23587877fdebSMatt Macy 
2359e716630dSMartin Matuska 		if (rc->rc_size == 0)
2360e716630dSMartin Matuska 			continue;
2361e716630dSMartin Matuska 
2362e716630dSMartin Matuska 		ASSERT3U(rc->rc_offset + rc->rc_size, <,
2363e716630dSMartin Matuska 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2364e716630dSMartin Matuska 
236581b22a98SMartin Matuska 		ASSERT3P(rc->rc_abd, !=, NULL);
23667877fdebSMatt Macy 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
236781b22a98SMartin Matuska 		    rc->rc_offset, rc->rc_abd,
236881b22a98SMartin Matuska 		    abd_get_size(rc->rc_abd), zio->io_type,
236981b22a98SMartin Matuska 		    zio->io_priority, 0, vdev_raidz_child_done, rc));
2370e716630dSMartin Matuska 
2371e716630dSMartin Matuska 		if (rc->rc_shadow_devidx != INT_MAX) {
2372e716630dSMartin Matuska 			vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2373e716630dSMartin Matuska 
2374e716630dSMartin Matuska 			ASSERT3U(
2375e716630dSMartin Matuska 			    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2376e716630dSMartin Matuska 			    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2377e716630dSMartin Matuska 
2378e716630dSMartin Matuska 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2379e716630dSMartin Matuska 			    rc->rc_shadow_offset, rc->rc_abd,
2380e716630dSMartin Matuska 			    abd_get_size(rc->rc_abd),
2381e716630dSMartin Matuska 			    zio->io_type, zio->io_priority, 0,
2382e716630dSMartin Matuska 			    vdev_raidz_shadow_child_done, rc));
238381b22a98SMartin Matuska 		}
23847877fdebSMatt Macy 	}
23857877fdebSMatt Macy }
23867877fdebSMatt Macy 
2387e716630dSMartin Matuska /*
2388e716630dSMartin Matuska  * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2389e716630dSMartin Matuska  * This only works for vdev_raidz_map_alloc() (not _expanded()).
2390e716630dSMartin Matuska  */
23917877fdebSMatt Macy static void
2392e716630dSMartin Matuska raidz_start_skip_writes(zio_t *zio)
2393e716630dSMartin Matuska {
2394e716630dSMartin Matuska 	vdev_t *vd = zio->io_vd;
2395e716630dSMartin Matuska 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2396e716630dSMartin Matuska 	raidz_map_t *rm = zio->io_vsd;
2397e716630dSMartin Matuska 	ASSERT3U(rm->rm_nrows, ==, 1);
2398e716630dSMartin Matuska 	raidz_row_t *rr = rm->rm_row[0];
2399e716630dSMartin Matuska 	for (int c = 0; c < rr->rr_scols; c++) {
2400e716630dSMartin Matuska 		raidz_col_t *rc = &rr->rr_col[c];
2401e716630dSMartin Matuska 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2402e716630dSMartin Matuska 		if (rc->rc_size != 0)
2403e716630dSMartin Matuska 			continue;
2404e716630dSMartin Matuska 		ASSERT3P(rc->rc_abd, ==, NULL);
2405e716630dSMartin Matuska 
2406e716630dSMartin Matuska 		ASSERT3U(rc->rc_offset, <,
2407e716630dSMartin Matuska 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2408e716630dSMartin Matuska 
2409e716630dSMartin Matuska 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2410e716630dSMartin Matuska 		    NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2411e716630dSMartin Matuska 		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2412e716630dSMartin Matuska 	}
2413e716630dSMartin Matuska }
2414e716630dSMartin Matuska 
2415e716630dSMartin Matuska static void
2416e716630dSMartin Matuska vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
24177877fdebSMatt Macy {
24187877fdebSMatt Macy 	vdev_t *vd = zio->io_vd;
24197877fdebSMatt Macy 
24207877fdebSMatt Macy 	/*
24217877fdebSMatt Macy 	 * Iterate over the columns in reverse order so that we hit the parity
24227877fdebSMatt Macy 	 * last -- any errors along the way will force us to read the parity.
24237877fdebSMatt Macy 	 */
24247877fdebSMatt Macy 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
24257877fdebSMatt Macy 		raidz_col_t *rc = &rr->rr_col[c];
24267877fdebSMatt Macy 		if (rc->rc_size == 0)
24277877fdebSMatt Macy 			continue;
24287877fdebSMatt Macy 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
24297877fdebSMatt Macy 		if (!vdev_readable(cvd)) {
24307877fdebSMatt Macy 			if (c >= rr->rr_firstdatacol)
24317877fdebSMatt Macy 				rr->rr_missingdata++;
24327877fdebSMatt Macy 			else
24337877fdebSMatt Macy 				rr->rr_missingparity++;
24347877fdebSMatt Macy 			rc->rc_error = SET_ERROR(ENXIO);
24357877fdebSMatt Macy 			rc->rc_tried = 1;	/* don't even try */
24367877fdebSMatt Macy 			rc->rc_skipped = 1;
24377877fdebSMatt Macy 			continue;
24387877fdebSMatt Macy 		}
24397877fdebSMatt Macy 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
24407877fdebSMatt Macy 			if (c >= rr->rr_firstdatacol)
24417877fdebSMatt Macy 				rr->rr_missingdata++;
24427877fdebSMatt Macy 			else
24437877fdebSMatt Macy 				rr->rr_missingparity++;
24447877fdebSMatt Macy 			rc->rc_error = SET_ERROR(ESTALE);
24457877fdebSMatt Macy 			rc->rc_skipped = 1;
24467877fdebSMatt Macy 			continue;
24477877fdebSMatt Macy 		}
2448e716630dSMartin Matuska 		if (forceparity ||
2449e716630dSMartin Matuska 		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
24507877fdebSMatt Macy 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
24517877fdebSMatt Macy 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
24527877fdebSMatt Macy 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
24537877fdebSMatt Macy 			    zio->io_type, zio->io_priority, 0,
24547877fdebSMatt Macy 			    vdev_raidz_child_done, rc));
24557877fdebSMatt Macy 		}
24567877fdebSMatt Macy 	}
24577877fdebSMatt Macy }
24587877fdebSMatt Macy 
2459e716630dSMartin Matuska static void
2460e716630dSMartin Matuska vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2461e716630dSMartin Matuska {
2462e716630dSMartin Matuska 	vdev_t *vd = zio->io_vd;
2463e716630dSMartin Matuska 
2464e716630dSMartin Matuska 	for (int i = 0; i < rm->rm_nphys_cols; i++) {
2465e716630dSMartin Matuska 		raidz_col_t *prc = &rm->rm_phys_col[i];
2466e716630dSMartin Matuska 		if (prc->rc_size == 0)
2467e716630dSMartin Matuska 			continue;
2468e716630dSMartin Matuska 
2469e716630dSMartin Matuska 		ASSERT3U(prc->rc_devidx, ==, i);
2470e716630dSMartin Matuska 		vdev_t *cvd = vd->vdev_child[i];
2471e716630dSMartin Matuska 		if (!vdev_readable(cvd)) {
2472e716630dSMartin Matuska 			prc->rc_error = SET_ERROR(ENXIO);
2473e716630dSMartin Matuska 			prc->rc_tried = 1;	/* don't even try */
2474e716630dSMartin Matuska 			prc->rc_skipped = 1;
2475e716630dSMartin Matuska 			continue;
2476e716630dSMartin Matuska 		}
2477e716630dSMartin Matuska 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2478e716630dSMartin Matuska 			prc->rc_error = SET_ERROR(ESTALE);
2479e716630dSMartin Matuska 			prc->rc_skipped = 1;
2480e716630dSMartin Matuska 			continue;
2481e716630dSMartin Matuska 		}
2482e716630dSMartin Matuska 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2483e716630dSMartin Matuska 		    prc->rc_offset, prc->rc_abd, prc->rc_size,
2484e716630dSMartin Matuska 		    zio->io_type, zio->io_priority, 0,
2485e716630dSMartin Matuska 		    vdev_raidz_child_done, prc));
2486e716630dSMartin Matuska 	}
2487e716630dSMartin Matuska }
2488e716630dSMartin Matuska 
2489e716630dSMartin Matuska static void
2490e716630dSMartin Matuska vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2491e716630dSMartin Matuska {
2492e716630dSMartin Matuska 	/*
2493e716630dSMartin Matuska 	 * If there are multiple rows, we will be hitting
2494e716630dSMartin Matuska 	 * all disks, so go ahead and read the parity so
2495e716630dSMartin Matuska 	 * that we are reading in decent size chunks.
2496e716630dSMartin Matuska 	 */
2497e716630dSMartin Matuska 	boolean_t forceparity = rm->rm_nrows > 1;
2498e716630dSMartin Matuska 
2499e716630dSMartin Matuska 	if (rm->rm_phys_col) {
2500e716630dSMartin Matuska 		vdev_raidz_io_start_read_phys_cols(zio, rm);
2501e716630dSMartin Matuska 	} else {
2502e716630dSMartin Matuska 		for (int i = 0; i < rm->rm_nrows; i++) {
2503e716630dSMartin Matuska 			raidz_row_t *rr = rm->rm_row[i];
2504e716630dSMartin Matuska 			vdev_raidz_io_start_read_row(zio, rr, forceparity);
2505e716630dSMartin Matuska 		}
2506e716630dSMartin Matuska 	}
2507e716630dSMartin Matuska }
2508e716630dSMartin Matuska 
2509eda14cbcSMatt Macy /*
2510eda14cbcSMatt Macy  * Start an IO operation on a RAIDZ VDev
2511eda14cbcSMatt Macy  *
2512eda14cbcSMatt Macy  * Outline:
2513eda14cbcSMatt Macy  * - For write operations:
2514eda14cbcSMatt Macy  *   1. Generate the parity data
2515eda14cbcSMatt Macy  *   2. Create child zio write operations to each column's vdev, for both
2516eda14cbcSMatt Macy  *      data and parity.
2517eda14cbcSMatt Macy  *   3. If the column skips any sectors for padding, create optional dummy
2518eda14cbcSMatt Macy  *      write zio children for those areas to improve aggregation continuity.
2519eda14cbcSMatt Macy  * - For read operations:
2520eda14cbcSMatt Macy  *   1. Create child zio read operations to each data column's vdev to read
2521eda14cbcSMatt Macy  *      the range of data required for zio.
2522eda14cbcSMatt Macy  *   2. If this is a scrub or resilver operation, or if any of the data
2523eda14cbcSMatt Macy  *      vdevs have had errors, then create zio read operations to the parity
2524eda14cbcSMatt Macy  *      columns' VDevs as well.
2525eda14cbcSMatt Macy  */
2526eda14cbcSMatt Macy static void
2527eda14cbcSMatt Macy vdev_raidz_io_start(zio_t *zio)
2528eda14cbcSMatt Macy {
2529eda14cbcSMatt Macy 	vdev_t *vd = zio->io_vd;
2530eda14cbcSMatt Macy 	vdev_t *tvd = vd->vdev_top;
25317877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2532e716630dSMartin Matuska 	raidz_map_t *rm;
2533eda14cbcSMatt Macy 
2534e716630dSMartin Matuska 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2535783d3ff6SMartin Matuska 	    BP_GET_BIRTH(zio->io_bp));
2536e716630dSMartin Matuska 	if (logical_width != vdrz->vd_physical_width) {
2537e716630dSMartin Matuska 		zfs_locked_range_t *lr = NULL;
2538e716630dSMartin Matuska 		uint64_t synced_offset = UINT64_MAX;
2539e716630dSMartin Matuska 		uint64_t next_offset = UINT64_MAX;
2540e716630dSMartin Matuska 		boolean_t use_scratch = B_FALSE;
2541e716630dSMartin Matuska 		/*
2542e716630dSMartin Matuska 		 * Note: when the expansion is completing, we set
2543e716630dSMartin Matuska 		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2544e716630dSMartin Matuska 		 * in a later txg than when we last update spa_ubsync's state
2545e716630dSMartin Matuska 		 * (see the end of spa_raidz_expand_thread()).  Therefore we
2546e716630dSMartin Matuska 		 * may see vre_state!=SCANNING before
2547e716630dSMartin Matuska 		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2548e716630dSMartin Matuska 		 * on disk, but the copying progress has been synced to disk
2549e716630dSMartin Matuska 		 * (and reflected in spa_ubsync).  In this case it's fine to
2550e716630dSMartin Matuska 		 * treat the expansion as completed, since if we crash there's
2551e716630dSMartin Matuska 		 * no additional copying to do.
2552e716630dSMartin Matuska 		 */
2553e716630dSMartin Matuska 		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2554e716630dSMartin Matuska 			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2555e716630dSMartin Matuska 			    &vdrz->vn_vre);
2556e716630dSMartin Matuska 			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2557e716630dSMartin Matuska 			    zio->io_offset, zio->io_size, RL_READER);
2558e716630dSMartin Matuska 			use_scratch =
2559e716630dSMartin Matuska 			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2560e716630dSMartin Matuska 			    RRSS_SCRATCH_VALID);
2561e716630dSMartin Matuska 			synced_offset =
2562e716630dSMartin Matuska 			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2563e716630dSMartin Matuska 			next_offset = vdrz->vn_vre.vre_offset;
2564e716630dSMartin Matuska 			/*
2565e716630dSMartin Matuska 			 * If we haven't resumed expanding since importing the
2566e716630dSMartin Matuska 			 * pool, vre_offset won't have been set yet.  In
2567e716630dSMartin Matuska 			 * this case the next offset to be copied is the same
2568e716630dSMartin Matuska 			 * as what was synced.
2569e716630dSMartin Matuska 			 */
2570e716630dSMartin Matuska 			if (next_offset == UINT64_MAX) {
2571e716630dSMartin Matuska 				next_offset = synced_offset;
2572e716630dSMartin Matuska 			}
2573e716630dSMartin Matuska 		}
2574e716630dSMartin Matuska 		if (use_scratch) {
2575e716630dSMartin Matuska 			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2576e716630dSMartin Matuska 			    "%lld next_offset=%lld use_scratch=%u",
2577e716630dSMartin Matuska 			    zio,
2578e716630dSMartin Matuska 			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2579e716630dSMartin Matuska 			    (long long)zio->io_offset,
2580e716630dSMartin Matuska 			    (long long)synced_offset,
2581e716630dSMartin Matuska 			    (long long)next_offset,
2582e716630dSMartin Matuska 			    use_scratch);
2583e716630dSMartin Matuska 		}
2584e716630dSMartin Matuska 
2585e716630dSMartin Matuska 		rm = vdev_raidz_map_alloc_expanded(zio,
2586e716630dSMartin Matuska 		    tvd->vdev_ashift, vdrz->vd_physical_width,
2587e716630dSMartin Matuska 		    logical_width, vdrz->vd_nparity,
2588e716630dSMartin Matuska 		    synced_offset, next_offset, use_scratch);
2589e716630dSMartin Matuska 		rm->rm_lr = lr;
2590e716630dSMartin Matuska 	} else {
2591e716630dSMartin Matuska 		rm = vdev_raidz_map_alloc(zio,
2592e716630dSMartin Matuska 		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2593e716630dSMartin Matuska 	}
2594e716630dSMartin Matuska 	rm->rm_original_width = vdrz->vd_original_width;
2595e716630dSMartin Matuska 
2596f9693befSMartin Matuska 	zio->io_vsd = rm;
2597f9693befSMartin Matuska 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2598eda14cbcSMatt Macy 	if (zio->io_type == ZIO_TYPE_WRITE) {
2599e716630dSMartin Matuska 		for (int i = 0; i < rm->rm_nrows; i++) {
2600e716630dSMartin Matuska 			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2601e716630dSMartin Matuska 		}
2602e716630dSMartin Matuska 
2603e716630dSMartin Matuska 		if (logical_width == vdrz->vd_physical_width) {
2604e716630dSMartin Matuska 			raidz_start_skip_writes(zio);
2605e716630dSMartin Matuska 		}
26067877fdebSMatt Macy 	} else {
2607eda14cbcSMatt Macy 		ASSERT(zio->io_type == ZIO_TYPE_READ);
2608e716630dSMartin Matuska 		vdev_raidz_io_start_read(zio, rm);
2609eda14cbcSMatt Macy 	}
2610eda14cbcSMatt Macy 
2611eda14cbcSMatt Macy 	zio_execute(zio);
2612eda14cbcSMatt Macy }
2613eda14cbcSMatt Macy 
2614eda14cbcSMatt Macy /*
2615eda14cbcSMatt Macy  * Report a checksum error for a child of a RAID-Z device.
2616eda14cbcSMatt Macy  */
2617e92ffd9bSMartin Matuska void
2618e92ffd9bSMartin Matuska vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2619eda14cbcSMatt Macy {
2620eda14cbcSMatt Macy 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2621eda14cbcSMatt Macy 
26227877fdebSMatt Macy 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
26237877fdebSMatt Macy 	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
2624eda14cbcSMatt Macy 		zio_bad_cksum_t zbc;
2625eda14cbcSMatt Macy 		raidz_map_t *rm = zio->io_vsd;
2626eda14cbcSMatt Macy 
2627eda14cbcSMatt Macy 		zbc.zbc_has_cksum = 0;
2628eda14cbcSMatt Macy 		zbc.zbc_injected = rm->rm_ecksuminjected;
2629eda14cbcSMatt Macy 
26302c48331dSMatt Macy 		mutex_enter(&vd->vdev_stat_lock);
26312c48331dSMatt Macy 		vd->vdev_stat.vs_checksum_errors++;
26322c48331dSMatt Macy 		mutex_exit(&vd->vdev_stat_lock);
2633bb2d13b6SMartin Matuska 		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
2634bb2d13b6SMartin Matuska 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2635bb2d13b6SMartin Matuska 		    rc->rc_abd, bad_data, &zbc);
26362c48331dSMatt Macy 	}
2637eda14cbcSMatt Macy }
2638eda14cbcSMatt Macy 
2639eda14cbcSMatt Macy /*
2640eda14cbcSMatt Macy  * We keep track of whether or not there were any injected errors, so that
2641eda14cbcSMatt Macy  * any ereports we generate can note it.
2642eda14cbcSMatt Macy  */
2643eda14cbcSMatt Macy static int
2644eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio)
2645eda14cbcSMatt Macy {
2646315ee00fSMartin Matuska 	zio_bad_cksum_t zbc = {0};
2647eda14cbcSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
2648eda14cbcSMatt Macy 
2649eda14cbcSMatt Macy 	int ret = zio_checksum_error(zio, &zbc);
265087bf66d4SMartin Matuska 	/*
265187bf66d4SMartin Matuska 	 * Any Direct I/O read that has a checksum error must be treated as
265287bf66d4SMartin Matuska 	 * suspicious as the contents of the buffer could be getting
265387bf66d4SMartin Matuska 	 * manipulated while the I/O is taking place. The checksum verify error
265487bf66d4SMartin Matuska 	 * will be reported to the top-level RAIDZ VDEV.
265587bf66d4SMartin Matuska 	 */
265687bf66d4SMartin Matuska 	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
265787bf66d4SMartin Matuska 		zio->io_error = ret;
265887bf66d4SMartin Matuska 		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
265987bf66d4SMartin Matuska 		zio_dio_chksum_verify_error_report(zio);
266087bf66d4SMartin Matuska 		zio_checksum_verified(zio);
266187bf66d4SMartin Matuska 		return (0);
266287bf66d4SMartin Matuska 	}
266387bf66d4SMartin Matuska 
2664eda14cbcSMatt Macy 	if (ret != 0 && zbc.zbc_injected != 0)
2665eda14cbcSMatt Macy 		rm->rm_ecksuminjected = 1;
2666eda14cbcSMatt Macy 
2667eda14cbcSMatt Macy 	return (ret);
2668eda14cbcSMatt Macy }
2669eda14cbcSMatt Macy 
2670eda14cbcSMatt Macy /*
2671eda14cbcSMatt Macy  * Generate the parity from the data columns. If we tried and were able to
2672eda14cbcSMatt Macy  * read the parity without error, verify that the generated parity matches the
2673eda14cbcSMatt Macy  * data we read. If it doesn't, we fire off a checksum error. Return the
26747877fdebSMatt Macy  * number of such failures.
2675eda14cbcSMatt Macy  */
2676eda14cbcSMatt Macy static int
26777877fdebSMatt Macy raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2678eda14cbcSMatt Macy {
2679eda14cbcSMatt Macy 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2680eda14cbcSMatt Macy 	int c, ret = 0;
26817877fdebSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
2682eda14cbcSMatt Macy 	raidz_col_t *rc;
2683eda14cbcSMatt Macy 
2684eda14cbcSMatt Macy 	blkptr_t *bp = zio->io_bp;
2685eda14cbcSMatt Macy 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2686eda14cbcSMatt Macy 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2687eda14cbcSMatt Macy 
2688eda14cbcSMatt Macy 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2689eda14cbcSMatt Macy 		return (ret);
2690eda14cbcSMatt Macy 
26917877fdebSMatt Macy 	for (c = 0; c < rr->rr_firstdatacol; c++) {
26927877fdebSMatt Macy 		rc = &rr->rr_col[c];
2693eda14cbcSMatt Macy 		if (!rc->rc_tried || rc->rc_error != 0)
2694eda14cbcSMatt Macy 			continue;
2695eda14cbcSMatt Macy 
2696a0b956f5SMartin Matuska 		orig[c] = rc->rc_abd;
2697a0b956f5SMartin Matuska 		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2698a0b956f5SMartin Matuska 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2699eda14cbcSMatt Macy 	}
2700eda14cbcSMatt Macy 
27017877fdebSMatt Macy 	/*
2702e92ffd9bSMartin Matuska 	 * Verify any empty sectors are zero filled to ensure the parity
2703e92ffd9bSMartin Matuska 	 * is calculated correctly even if these non-data sectors are damaged.
2704e92ffd9bSMartin Matuska 	 */
2705e92ffd9bSMartin Matuska 	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2706e92ffd9bSMartin Matuska 		ret += vdev_draid_map_verify_empty(zio, rr);
2707e92ffd9bSMartin Matuska 
2708e92ffd9bSMartin Matuska 	/*
27097877fdebSMatt Macy 	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
27107877fdebSMatt Macy 	 * isn't harmful but it does have the side effect of fixing stuff
27117877fdebSMatt Macy 	 * we didn't realize was necessary (i.e. even if we return 0).
27127877fdebSMatt Macy 	 */
27137877fdebSMatt Macy 	vdev_raidz_generate_parity_row(rm, rr);
2714eda14cbcSMatt Macy 
27157877fdebSMatt Macy 	for (c = 0; c < rr->rr_firstdatacol; c++) {
27167877fdebSMatt Macy 		rc = &rr->rr_col[c];
27177877fdebSMatt Macy 
2718eda14cbcSMatt Macy 		if (!rc->rc_tried || rc->rc_error != 0)
2719eda14cbcSMatt Macy 			continue;
27207877fdebSMatt Macy 
2721eda14cbcSMatt Macy 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2722e716630dSMartin Matuska 			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2723e716630dSMartin Matuska 			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2724e92ffd9bSMartin Matuska 			vdev_raidz_checksum_error(zio, rc, orig[c]);
2725eda14cbcSMatt Macy 			rc->rc_error = SET_ERROR(ECKSUM);
2726eda14cbcSMatt Macy 			ret++;
2727eda14cbcSMatt Macy 		}
2728eda14cbcSMatt Macy 		abd_free(orig[c]);
2729eda14cbcSMatt Macy 	}
2730eda14cbcSMatt Macy 
2731eda14cbcSMatt Macy 	return (ret);
2732eda14cbcSMatt Macy }
2733eda14cbcSMatt Macy 
2734eda14cbcSMatt Macy static int
27357877fdebSMatt Macy vdev_raidz_worst_error(raidz_row_t *rr)
2736eda14cbcSMatt Macy {
2737eda14cbcSMatt Macy 	int error = 0;
2738eda14cbcSMatt Macy 
2739e716630dSMartin Matuska 	for (int c = 0; c < rr->rr_cols; c++) {
27407877fdebSMatt Macy 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
2741e716630dSMartin Matuska 		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2742e716630dSMartin Matuska 	}
2743eda14cbcSMatt Macy 
2744eda14cbcSMatt Macy 	return (error);
2745eda14cbcSMatt Macy }
2746eda14cbcSMatt Macy 
2747eda14cbcSMatt Macy static void
27487877fdebSMatt Macy vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2749eda14cbcSMatt Macy {
2750eda14cbcSMatt Macy 	int unexpected_errors = 0;
2751eda14cbcSMatt Macy 	int parity_errors = 0;
2752eda14cbcSMatt Macy 	int parity_untried = 0;
2753eda14cbcSMatt Macy 	int data_errors = 0;
2754eda14cbcSMatt Macy 
27557877fdebSMatt Macy 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2756eda14cbcSMatt Macy 
27577877fdebSMatt Macy 	for (int c = 0; c < rr->rr_cols; c++) {
27587877fdebSMatt Macy 		raidz_col_t *rc = &rr->rr_col[c];
2759eda14cbcSMatt Macy 
2760eda14cbcSMatt Macy 		if (rc->rc_error) {
27617877fdebSMatt Macy 			if (c < rr->rr_firstdatacol)
2762eda14cbcSMatt Macy 				parity_errors++;
2763eda14cbcSMatt Macy 			else
2764eda14cbcSMatt Macy 				data_errors++;
2765eda14cbcSMatt Macy 
2766eda14cbcSMatt Macy 			if (!rc->rc_skipped)
2767eda14cbcSMatt Macy 				unexpected_errors++;
27687877fdebSMatt Macy 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2769eda14cbcSMatt Macy 			parity_untried++;
2770eda14cbcSMatt Macy 		}
2771a0b956f5SMartin Matuska 
2772a0b956f5SMartin Matuska 		if (rc->rc_force_repair)
2773a0b956f5SMartin Matuska 			unexpected_errors++;
2774eda14cbcSMatt Macy 	}
2775eda14cbcSMatt Macy 
2776eda14cbcSMatt Macy 	/*
27777877fdebSMatt Macy 	 * If we read more parity disks than were used for
27787877fdebSMatt Macy 	 * reconstruction, confirm that the other parity disks produced
27797877fdebSMatt Macy 	 * correct data.
27807877fdebSMatt Macy 	 *
27817877fdebSMatt Macy 	 * Note that we also regenerate parity when resilvering so we
27827877fdebSMatt Macy 	 * can write it out to failed devices later.
27837877fdebSMatt Macy 	 */
27847877fdebSMatt Macy 	if (parity_errors + parity_untried <
27857877fdebSMatt Macy 	    rr->rr_firstdatacol - data_errors ||
27867877fdebSMatt Macy 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
27877877fdebSMatt Macy 		int n = raidz_parity_verify(zio, rr);
27887877fdebSMatt Macy 		unexpected_errors += n;
27897877fdebSMatt Macy 	}
27907877fdebSMatt Macy 
27917877fdebSMatt Macy 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
27927877fdebSMatt Macy 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
27937877fdebSMatt Macy 		/*
27947877fdebSMatt Macy 		 * Use the good data we have in hand to repair damaged children.
27957877fdebSMatt Macy 		 */
27967877fdebSMatt Macy 		for (int c = 0; c < rr->rr_cols; c++) {
27977877fdebSMatt Macy 			raidz_col_t *rc = &rr->rr_col[c];
27987877fdebSMatt Macy 			vdev_t *vd = zio->io_vd;
27997877fdebSMatt Macy 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
28007877fdebSMatt Macy 
280116038816SMartin Matuska 			if (!rc->rc_allow_repair) {
280216038816SMartin Matuska 				continue;
280316038816SMartin Matuska 			} else if (!rc->rc_force_repair &&
280416038816SMartin Matuska 			    (rc->rc_error == 0 || rc->rc_size == 0)) {
28057877fdebSMatt Macy 				continue;
28067877fdebSMatt Macy 			}
280787bf66d4SMartin Matuska 			/*
280887bf66d4SMartin Matuska 			 * We do not allow self healing for Direct I/O reads.
280987bf66d4SMartin Matuska 			 * See comment in vdev_raid_row_alloc().
281087bf66d4SMartin Matuska 			 */
281187bf66d4SMartin Matuska 			ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
28127877fdebSMatt Macy 
2813e716630dSMartin Matuska 			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2814e716630dSMartin Matuska 			    "offset=%llx",
2815e716630dSMartin Matuska 			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2816e716630dSMartin Matuska 
28177877fdebSMatt Macy 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
28187877fdebSMatt Macy 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
28197877fdebSMatt Macy 			    ZIO_TYPE_WRITE,
28207877fdebSMatt Macy 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
28217877fdebSMatt Macy 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
28227877fdebSMatt Macy 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
28237877fdebSMatt Macy 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
28247877fdebSMatt Macy 		}
28257877fdebSMatt Macy 	}
2826e716630dSMartin Matuska 
2827e716630dSMartin Matuska 	/*
2828e716630dSMartin Matuska 	 * Scrub or resilver i/o's: overwrite any shadow locations with the
2829e716630dSMartin Matuska 	 * good data.  This ensures that if we've already copied this sector,
2830e716630dSMartin Matuska 	 * it will be corrected if it was damaged.  This writes more than is
2831e716630dSMartin Matuska 	 * necessary, but since expansion is paused during scrub/resilver, at
2832e716630dSMartin Matuska 	 * most a single row will have a shadow location.
2833e716630dSMartin Matuska 	 */
2834e716630dSMartin Matuska 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2835e716630dSMartin Matuska 	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2836e716630dSMartin Matuska 		for (int c = 0; c < rr->rr_cols; c++) {
2837e716630dSMartin Matuska 			raidz_col_t *rc = &rr->rr_col[c];
2838e716630dSMartin Matuska 			vdev_t *vd = zio->io_vd;
2839e716630dSMartin Matuska 
2840e716630dSMartin Matuska 			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2841e716630dSMartin Matuska 				continue;
2842e716630dSMartin Matuska 			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2843e716630dSMartin Matuska 
2844e716630dSMartin Matuska 			/*
2845e716630dSMartin Matuska 			 * Note: We don't want to update the repair stats
2846e716630dSMartin Matuska 			 * because that would incorrectly indicate that there
2847e716630dSMartin Matuska 			 * was bad data to repair, which we aren't sure about.
2848e716630dSMartin Matuska 			 * By clearing the SCAN_THREAD flag, we prevent this
2849e716630dSMartin Matuska 			 * from happening, despite having the REPAIR flag set.
2850e716630dSMartin Matuska 			 * We need to set SELF_HEAL so that this i/o can't be
2851e716630dSMartin Matuska 			 * bypassed by zio_vdev_io_start().
2852e716630dSMartin Matuska 			 */
2853e716630dSMartin Matuska 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2854e716630dSMartin Matuska 			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2855e716630dSMartin Matuska 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2856e716630dSMartin Matuska 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2857e716630dSMartin Matuska 			    NULL, NULL);
2858e716630dSMartin Matuska 			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2859e716630dSMartin Matuska 			zio_nowait(cio);
2860e716630dSMartin Matuska 		}
2861e716630dSMartin Matuska 	}
28627877fdebSMatt Macy }
28637877fdebSMatt Macy 
28647877fdebSMatt Macy static void
28657877fdebSMatt Macy raidz_restore_orig_data(raidz_map_t *rm)
28667877fdebSMatt Macy {
28677877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++) {
28687877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[i];
28697877fdebSMatt Macy 		for (int c = 0; c < rr->rr_cols; c++) {
28707877fdebSMatt Macy 			raidz_col_t *rc = &rr->rr_col[c];
28717877fdebSMatt Macy 			if (rc->rc_need_orig_restore) {
2872f9693befSMartin Matuska 				abd_copy(rc->rc_abd,
28737877fdebSMatt Macy 				    rc->rc_orig_data, rc->rc_size);
28747877fdebSMatt Macy 				rc->rc_need_orig_restore = B_FALSE;
28757877fdebSMatt Macy 			}
28767877fdebSMatt Macy 		}
28777877fdebSMatt Macy 	}
28787877fdebSMatt Macy }
28797877fdebSMatt Macy 
28807877fdebSMatt Macy /*
2881e716630dSMartin Matuska  * During raidz_reconstruct() for expanded VDEV, we need special consideration
2882e716630dSMartin Matuska  * failure simulations.  See note in raidz_reconstruct() on simulating failure
2883e716630dSMartin Matuska  * of a pre-expansion device.
2884e716630dSMartin Matuska  *
2885e716630dSMartin Matuska  * Treating logical child i as failed, return TRUE if the given column should
2886e716630dSMartin Matuska  * be treated as failed.  The idea of logical children allows us to imagine
2887e716630dSMartin Matuska  * that a disk silently failed before a RAIDZ expansion (reads from this disk
2888e716630dSMartin Matuska  * succeed but return the wrong data).  Since the expansion doesn't verify
2889e716630dSMartin Matuska  * checksums, the incorrect data will be moved to new locations spread among
2890e716630dSMartin Matuska  * the children (going diagonally across them).
2891e716630dSMartin Matuska  *
2892e716630dSMartin Matuska  * Higher "logical child failures" (values of `i`) indicate these
2893e716630dSMartin Matuska  * "pre-expansion failures".  The first physical_width values imagine that a
2894e716630dSMartin Matuska  * current child failed; the next physical_width-1 values imagine that a
2895e716630dSMartin Matuska  * child failed before the most recent expansion; the next physical_width-2
2896e716630dSMartin Matuska  * values imagine a child failed in the expansion before that, etc.
2897e716630dSMartin Matuska  */
2898e716630dSMartin Matuska static boolean_t
2899e716630dSMartin Matuska raidz_simulate_failure(int physical_width, int original_width, int ashift,
2900e716630dSMartin Matuska     int i, raidz_col_t *rc)
2901e716630dSMartin Matuska {
2902e716630dSMartin Matuska 	uint64_t sector_id =
2903e716630dSMartin Matuska 	    physical_width * (rc->rc_offset >> ashift) +
2904e716630dSMartin Matuska 	    rc->rc_devidx;
2905e716630dSMartin Matuska 
2906e716630dSMartin Matuska 	for (int w = physical_width; w >= original_width; w--) {
2907e716630dSMartin Matuska 		if (i < w) {
2908e716630dSMartin Matuska 			return (sector_id % w == i);
2909e716630dSMartin Matuska 		} else {
2910e716630dSMartin Matuska 			i -= w;
2911e716630dSMartin Matuska 		}
2912e716630dSMartin Matuska 	}
2913e716630dSMartin Matuska 	ASSERT(!"invalid logical child id");
2914e716630dSMartin Matuska 	return (B_FALSE);
2915e716630dSMartin Matuska }
2916e716630dSMartin Matuska 
2917e716630dSMartin Matuska /*
29187877fdebSMatt Macy  * returns EINVAL if reconstruction of the block will not be possible
29197877fdebSMatt Macy  * returns ECKSUM if this specific reconstruction failed
29207877fdebSMatt Macy  * returns 0 on successful reconstruction
29217877fdebSMatt Macy  */
29227877fdebSMatt Macy static int
29237877fdebSMatt Macy raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
29247877fdebSMatt Macy {
29257877fdebSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
2926e716630dSMartin Matuska 	int physical_width = zio->io_vd->vdev_children;
2927e716630dSMartin Matuska 	int original_width = (rm->rm_original_width != 0) ?
2928e716630dSMartin Matuska 	    rm->rm_original_width : physical_width;
2929e716630dSMartin Matuska 	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2930e716630dSMartin Matuska 
2931e716630dSMartin Matuska 	if (dbgmsg) {
2932e716630dSMartin Matuska 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2933e716630dSMartin Matuska 		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2934e716630dSMartin Matuska 	}
29357877fdebSMatt Macy 
29367877fdebSMatt Macy 	/* Reconstruct each row */
29377877fdebSMatt Macy 	for (int r = 0; r < rm->rm_nrows; r++) {
29387877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[r];
29397877fdebSMatt Macy 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
29407877fdebSMatt Macy 		int t = 0;
29417877fdebSMatt Macy 		int dead = 0;
29427877fdebSMatt Macy 		int dead_data = 0;
29437877fdebSMatt Macy 
2944e716630dSMartin Matuska 		if (dbgmsg)
2945e716630dSMartin Matuska 			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2946e716630dSMartin Matuska 
29477877fdebSMatt Macy 		for (int c = 0; c < rr->rr_cols; c++) {
29487877fdebSMatt Macy 			raidz_col_t *rc = &rr->rr_col[c];
29497877fdebSMatt Macy 			ASSERT0(rc->rc_need_orig_restore);
29507877fdebSMatt Macy 			if (rc->rc_error != 0) {
29517877fdebSMatt Macy 				dead++;
29527877fdebSMatt Macy 				if (c >= nparity)
29537877fdebSMatt Macy 					dead_data++;
29547877fdebSMatt Macy 				continue;
29557877fdebSMatt Macy 			}
29567877fdebSMatt Macy 			if (rc->rc_size == 0)
29577877fdebSMatt Macy 				continue;
29587877fdebSMatt Macy 			for (int lt = 0; lt < ntgts; lt++) {
2959e716630dSMartin Matuska 				if (raidz_simulate_failure(physical_width,
2960e716630dSMartin Matuska 				    original_width,
2961e716630dSMartin Matuska 				    zio->io_vd->vdev_top->vdev_ashift,
2962e716630dSMartin Matuska 				    ltgts[lt], rc)) {
29637877fdebSMatt Macy 					if (rc->rc_orig_data == NULL) {
29647877fdebSMatt Macy 						rc->rc_orig_data =
2965f9693befSMartin Matuska 						    abd_alloc_linear(
2966f9693befSMartin Matuska 						    rc->rc_size, B_TRUE);
2967f9693befSMartin Matuska 						abd_copy(rc->rc_orig_data,
29687877fdebSMatt Macy 						    rc->rc_abd, rc->rc_size);
29697877fdebSMatt Macy 					}
29707877fdebSMatt Macy 					rc->rc_need_orig_restore = B_TRUE;
29717877fdebSMatt Macy 
29727877fdebSMatt Macy 					dead++;
29737877fdebSMatt Macy 					if (c >= nparity)
29747877fdebSMatt Macy 						dead_data++;
2975e716630dSMartin Matuska 					/*
2976e716630dSMartin Matuska 					 * Note: simulating failure of a
2977e716630dSMartin Matuska 					 * pre-expansion device can hit more
2978e716630dSMartin Matuska 					 * than one column, in which case we
2979e716630dSMartin Matuska 					 * might try to simulate more failures
2980e716630dSMartin Matuska 					 * than can be reconstructed, which is
2981e716630dSMartin Matuska 					 * also more than the size of my_tgts.
2982e716630dSMartin Matuska 					 * This check prevents accessing past
2983e716630dSMartin Matuska 					 * the end of my_tgts.  The "dead >
2984e716630dSMartin Matuska 					 * nparity" check below will fail this
2985e716630dSMartin Matuska 					 * reconstruction attempt.
2986e716630dSMartin Matuska 					 */
2987e716630dSMartin Matuska 					if (t < VDEV_RAIDZ_MAXPARITY) {
29887877fdebSMatt Macy 						my_tgts[t++] = c;
2989e716630dSMartin Matuska 						if (dbgmsg) {
2990e716630dSMartin Matuska 							zfs_dbgmsg("simulating "
2991e716630dSMartin Matuska 							    "failure of col %u "
2992e716630dSMartin Matuska 							    "devidx %u", c,
2993e716630dSMartin Matuska 							    (int)rc->rc_devidx);
2994e716630dSMartin Matuska 						}
2995e716630dSMartin Matuska 					}
29967877fdebSMatt Macy 					break;
29977877fdebSMatt Macy 				}
29987877fdebSMatt Macy 			}
29997877fdebSMatt Macy 		}
30007877fdebSMatt Macy 		if (dead > nparity) {
30017877fdebSMatt Macy 			/* reconstruction not possible */
3002e716630dSMartin Matuska 			if (dbgmsg) {
3003e716630dSMartin Matuska 				zfs_dbgmsg("reconstruction not possible; "
3004e716630dSMartin Matuska 				    "too many failures");
3005e716630dSMartin Matuska 			}
30067877fdebSMatt Macy 			raidz_restore_orig_data(rm);
30077877fdebSMatt Macy 			return (EINVAL);
30087877fdebSMatt Macy 		}
30097877fdebSMatt Macy 		if (dead_data > 0)
3010f9693befSMartin Matuska 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
30117877fdebSMatt Macy 	}
30127877fdebSMatt Macy 
30137877fdebSMatt Macy 	/* Check for success */
30147877fdebSMatt Macy 	if (raidz_checksum_verify(zio) == 0) {
301587bf66d4SMartin Matuska 		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
301687bf66d4SMartin Matuska 			return (0);
30177877fdebSMatt Macy 
30187877fdebSMatt Macy 		/* Reconstruction succeeded - report errors */
30197877fdebSMatt Macy 		for (int i = 0; i < rm->rm_nrows; i++) {
30207877fdebSMatt Macy 			raidz_row_t *rr = rm->rm_row[i];
30217877fdebSMatt Macy 
30227877fdebSMatt Macy 			for (int c = 0; c < rr->rr_cols; c++) {
30237877fdebSMatt Macy 				raidz_col_t *rc = &rr->rr_col[c];
30247877fdebSMatt Macy 				if (rc->rc_need_orig_restore) {
30257877fdebSMatt Macy 					/*
30267877fdebSMatt Macy 					 * Note: if this is a parity column,
30277877fdebSMatt Macy 					 * we don't really know if it's wrong.
30287877fdebSMatt Macy 					 * We need to let
30297877fdebSMatt Macy 					 * vdev_raidz_io_done_verified() check
30307877fdebSMatt Macy 					 * it, and if we set rc_error, it will
30317877fdebSMatt Macy 					 * think that it is a "known" error
30327877fdebSMatt Macy 					 * that doesn't need to be checked
30337877fdebSMatt Macy 					 * or corrected.
30347877fdebSMatt Macy 					 */
30357877fdebSMatt Macy 					if (rc->rc_error == 0 &&
30367877fdebSMatt Macy 					    c >= rr->rr_firstdatacol) {
3037e92ffd9bSMartin Matuska 						vdev_raidz_checksum_error(zio,
3038f9693befSMartin Matuska 						    rc, rc->rc_orig_data);
30397877fdebSMatt Macy 						rc->rc_error =
30407877fdebSMatt Macy 						    SET_ERROR(ECKSUM);
30417877fdebSMatt Macy 					}
30427877fdebSMatt Macy 					rc->rc_need_orig_restore = B_FALSE;
30437877fdebSMatt Macy 				}
30447877fdebSMatt Macy 			}
30457877fdebSMatt Macy 
30467877fdebSMatt Macy 			vdev_raidz_io_done_verified(zio, rr);
30477877fdebSMatt Macy 		}
30487877fdebSMatt Macy 
30497877fdebSMatt Macy 		zio_checksum_verified(zio);
30507877fdebSMatt Macy 
3051e716630dSMartin Matuska 		if (dbgmsg) {
3052e716630dSMartin Matuska 			zfs_dbgmsg("reconstruction successful "
3053e716630dSMartin Matuska 			    "(checksum verified)");
3054e716630dSMartin Matuska 		}
30557877fdebSMatt Macy 		return (0);
30567877fdebSMatt Macy 	}
30577877fdebSMatt Macy 
30587877fdebSMatt Macy 	/* Reconstruction failed - restore original data */
30597877fdebSMatt Macy 	raidz_restore_orig_data(rm);
3060e716630dSMartin Matuska 	if (dbgmsg) {
3061e716630dSMartin Matuska 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3062e716630dSMartin Matuska 		    "failed", zio);
3063e716630dSMartin Matuska 	}
30647877fdebSMatt Macy 	return (ECKSUM);
30657877fdebSMatt Macy }
30667877fdebSMatt Macy 
30677877fdebSMatt Macy /*
30687877fdebSMatt Macy  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
30697877fdebSMatt Macy  * Note that the algorithm below is non-optimal because it doesn't take into
30707877fdebSMatt Macy  * account how reconstruction is actually performed. For example, with
30717877fdebSMatt Macy  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
30727877fdebSMatt Macy  * is targeted as invalid as if columns 1 and 4 are targeted since in both
30737877fdebSMatt Macy  * cases we'd only use parity information in column 0.
30747877fdebSMatt Macy  *
30757877fdebSMatt Macy  * The order that we find the various possible combinations of failed
30767877fdebSMatt Macy  * disks is dictated by these rules:
30777877fdebSMatt Macy  * - Examine each "slot" (the "i" in tgts[i])
3078e716630dSMartin Matuska  *   - Try to increment this slot (tgts[i] += 1)
30797877fdebSMatt Macy  *   - if we can't increment because it runs into the next slot,
30807877fdebSMatt Macy  *     reset our slot to the minimum, and examine the next slot
30817877fdebSMatt Macy  *
30827877fdebSMatt Macy  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
30837877fdebSMatt Macy  *  3 columns to reconstruct), we will generate the following sequence:
30847877fdebSMatt Macy  *
30857877fdebSMatt Macy  *  STATE        ACTION
30867877fdebSMatt Macy  *  0 1 2        special case: skip since these are all parity
30877877fdebSMatt Macy  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
30887877fdebSMatt Macy  *  0   2 3      first slot: increment to 1
30897877fdebSMatt Macy  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
30907877fdebSMatt Macy  *  0 1     4    first: reset to 0; middle: increment to 2
30917877fdebSMatt Macy  *  0   2   4    first: increment to 1
30927877fdebSMatt Macy  *    1 2   4    first: reset to 0; middle: increment to 3
30937877fdebSMatt Macy  *  0     3 4    first: increment to 1
30947877fdebSMatt Macy  *    1   3 4    first: increment to 2
30957877fdebSMatt Macy  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
30967877fdebSMatt Macy  *  0 1       5  first: reset to 0; middle: increment to 2
30977877fdebSMatt Macy  *  0   2     5  first: increment to 1
30987877fdebSMatt Macy  *    1 2     5  first: reset to 0; middle: increment to 3
30997877fdebSMatt Macy  *  0     3   5  first: increment to 1
31007877fdebSMatt Macy  *    1   3   5  first: increment to 2
31017877fdebSMatt Macy  *      2 3   5  first: reset to 0; middle: increment to 4
31027877fdebSMatt Macy  *  0       4 5  first: increment to 1
31037877fdebSMatt Macy  *    1     4 5  first: increment to 2
31047877fdebSMatt Macy  *      2   4 5  first: increment to 3
31057877fdebSMatt Macy  *        3 4 5  done
31067877fdebSMatt Macy  *
310716038816SMartin Matuska  * This strategy works for dRAID but is less efficient when there are a large
31087877fdebSMatt Macy  * number of child vdevs and therefore permutations to check. Furthermore,
3109e716630dSMartin Matuska  * since the raidz_map_t rows likely do not overlap, reconstruction would be
31107877fdebSMatt Macy  * possible as long as there are no more than nparity data errors per row.
31117877fdebSMatt Macy  * These additional permutations are not currently checked but could be as
31127877fdebSMatt Macy  * a future improvement.
3113e716630dSMartin Matuska  *
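 * Returns 0 on success and ECKSUM if no combination of failures yields a
 * block that passes checksum verification.  If some row already has more
 * than nparity columns with errors, no combination is tried and the value
 * of vdev_raidz_worst_error() for that row is returned instead.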
31157877fdebSMatt Macy  */
31167877fdebSMatt Macy static int
31177877fdebSMatt Macy vdev_raidz_combrec(zio_t *zio)
31187877fdebSMatt Macy {
31197877fdebSMatt Macy 	int nparity = vdev_get_nparity(zio->io_vd);
31207877fdebSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
3121e716630dSMartin Matuska 	int physical_width = zio->io_vd->vdev_children;
3122e716630dSMartin Matuska 	int original_width = (rm->rm_original_width != 0) ?
3123e716630dSMartin Matuska 	    rm->rm_original_width : physical_width;
31247877fdebSMatt Macy 
31257877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++) {
31267877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[i];
31277877fdebSMatt Macy 		int total_errors = 0;
31287877fdebSMatt Macy 
31297877fdebSMatt Macy 		for (int c = 0; c < rr->rr_cols; c++) {
31307877fdebSMatt Macy 			if (rr->rr_col[c].rc_error)
31317877fdebSMatt Macy 				total_errors++;
31327877fdebSMatt Macy 		}
31337877fdebSMatt Macy 
31347877fdebSMatt Macy 		if (total_errors > nparity)
31357877fdebSMatt Macy 			return (vdev_raidz_worst_error(rr));
31367877fdebSMatt Macy 	}
31377877fdebSMatt Macy 
31387877fdebSMatt Macy 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
31397877fdebSMatt Macy 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
31407877fdebSMatt Macy 		int *ltgts = &tstore[1]; /* value is logical child ID */
31417877fdebSMatt Macy 
3142e716630dSMartin Matuska 
3143e716630dSMartin Matuska 		/*
3144e716630dSMartin Matuska 		 * Determine number of logical children, n.  See comment
3145e716630dSMartin Matuska 		 * above raidz_simulate_failure().
3146e716630dSMartin Matuska 		 */
3147e716630dSMartin Matuska 		int n = 0;
3148e716630dSMartin Matuska 		for (int w = physical_width;
3149e716630dSMartin Matuska 		    w >= original_width; w--) {
3150e716630dSMartin Matuska 			n += w;
3151e716630dSMartin Matuska 		}
31527877fdebSMatt Macy 
31537877fdebSMatt Macy 		ASSERT3U(num_failures, <=, nparity);
31547877fdebSMatt Macy 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
31557877fdebSMatt Macy 
31567877fdebSMatt Macy 		/* Handle corner cases in combrec logic */
31577877fdebSMatt Macy 		ltgts[-1] = -1;
31587877fdebSMatt Macy 		for (int i = 0; i < num_failures; i++) {
31597877fdebSMatt Macy 			ltgts[i] = i;
31607877fdebSMatt Macy 		}
31617877fdebSMatt Macy 		ltgts[num_failures] = n;
31627877fdebSMatt Macy 
31637877fdebSMatt Macy 		for (;;) {
31647877fdebSMatt Macy 			int err = raidz_reconstruct(zio, ltgts, num_failures,
31657877fdebSMatt Macy 			    nparity);
31667877fdebSMatt Macy 			if (err == EINVAL) {
31677877fdebSMatt Macy 				/*
31687877fdebSMatt Macy 				 * Reconstruction not possible with this #
31697877fdebSMatt Macy 				 * failures; try more failures.
31707877fdebSMatt Macy 				 */
31717877fdebSMatt Macy 				break;
31727877fdebSMatt Macy 			} else if (err == 0)
31737877fdebSMatt Macy 				return (0);
31747877fdebSMatt Macy 
31757877fdebSMatt Macy 			/* Compute next targets to try */
31767877fdebSMatt Macy 			for (int t = 0; ; t++) {
31777877fdebSMatt Macy 				ASSERT3U(t, <, num_failures);
31787877fdebSMatt Macy 				ltgts[t]++;
31797877fdebSMatt Macy 				if (ltgts[t] == n) {
31807877fdebSMatt Macy 					/* try more failures */
31817877fdebSMatt Macy 					ASSERT3U(t, ==, num_failures - 1);
3182e716630dSMartin Matuska 					if (zfs_flags &
3183e716630dSMartin Matuska 					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3184e716630dSMartin Matuska 						zfs_dbgmsg("reconstruction "
3185e716630dSMartin Matuska 						    "failed for num_failures="
3186e716630dSMartin Matuska 						    "%u; tried all "
3187e716630dSMartin Matuska 						    "combinations",
3188e716630dSMartin Matuska 						    num_failures);
3189e716630dSMartin Matuska 					}
31907877fdebSMatt Macy 					break;
31917877fdebSMatt Macy 				}
31927877fdebSMatt Macy 
31937877fdebSMatt Macy 				ASSERT3U(ltgts[t], <, n);
31947877fdebSMatt Macy 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
31957877fdebSMatt Macy 
31967877fdebSMatt Macy 				/*
31977877fdebSMatt Macy 				 * If that spot is available, we're done here.
31987877fdebSMatt Macy 				 * Try the next combination.
31997877fdebSMatt Macy 				 */
32007877fdebSMatt Macy 				if (ltgts[t] != ltgts[t + 1])
3201e716630dSMartin Matuska 					break; // found next combination
32027877fdebSMatt Macy 
32037877fdebSMatt Macy 				/*
32047877fdebSMatt Macy 				 * Otherwise, reset this tgt to the minimum,
32057877fdebSMatt Macy 				 * and move on to the next tgt.
32067877fdebSMatt Macy 				 */
32077877fdebSMatt Macy 				ltgts[t] = ltgts[t - 1] + 1;
32087877fdebSMatt Macy 				ASSERT3U(ltgts[t], ==, t);
32097877fdebSMatt Macy 			}
32107877fdebSMatt Macy 
32117877fdebSMatt Macy 			/* Increase the number of failures and keep trying. */
32127877fdebSMatt Macy 			if (ltgts[num_failures - 1] == n)
32137877fdebSMatt Macy 				break;
32147877fdebSMatt Macy 		}
32157877fdebSMatt Macy 	}
3216e716630dSMartin Matuska 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3217e716630dSMartin Matuska 		zfs_dbgmsg("reconstruction failed for all num_failures");
32187877fdebSMatt Macy 	return (ECKSUM);
32197877fdebSMatt Macy }
32207877fdebSMatt Macy 
32217877fdebSMatt Macy void
32227877fdebSMatt Macy vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
32237877fdebSMatt Macy {
32247877fdebSMatt Macy 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
32257877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[row];
32267877fdebSMatt Macy 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
32277877fdebSMatt Macy 	}
32287877fdebSMatt Macy }
32297877fdebSMatt Macy 
32307877fdebSMatt Macy /*
32317877fdebSMatt Macy  * Complete a write IO operation on a RAIDZ VDev
32327877fdebSMatt Macy  *
32337877fdebSMatt Macy  * Outline:
32347877fdebSMatt Macy  *   1. Check for errors on the child IOs.
32357877fdebSMatt Macy  *   2. Return, setting an error code if too few child VDevs were written
32367877fdebSMatt Macy  *      to reconstruct the data later.  Note that partial writes are
32377877fdebSMatt Macy  *      considered successful if they can be reconstructed at all.
32387877fdebSMatt Macy  */
32397877fdebSMatt Macy static void
32407877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
32417877fdebSMatt Macy {
3242e716630dSMartin Matuska 	int normal_errors = 0;
3243e716630dSMartin Matuska 	int shadow_errors = 0;
32447877fdebSMatt Macy 
32457877fdebSMatt Macy 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
32467877fdebSMatt Macy 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
32477877fdebSMatt Macy 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
32487877fdebSMatt Macy 
32497877fdebSMatt Macy 	for (int c = 0; c < rr->rr_cols; c++) {
32507877fdebSMatt Macy 		raidz_col_t *rc = &rr->rr_col[c];
32517877fdebSMatt Macy 
3252e716630dSMartin Matuska 		if (rc->rc_error != 0) {
32537877fdebSMatt Macy 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
3254e716630dSMartin Matuska 			normal_errors++;
3255e716630dSMartin Matuska 		}
3256e716630dSMartin Matuska 		if (rc->rc_shadow_error != 0) {
3257e716630dSMartin Matuska 			ASSERT(rc->rc_shadow_error != ECKSUM);
3258e716630dSMartin Matuska 			shadow_errors++;
32597877fdebSMatt Macy 		}
32607877fdebSMatt Macy 	}
32617877fdebSMatt Macy 
32627877fdebSMatt Macy 	/*
32637877fdebSMatt Macy 	 * Treat partial writes as a success. If we couldn't write enough
3264e716630dSMartin Matuska 	 * columns to reconstruct the data, the I/O failed.  Otherwise, good
3265e716630dSMartin Matuska 	 * enough.  Note that in the case of a shadow write (during raidz
3266e716630dSMartin Matuska 	 * expansion), depending on if we crash, either the normal (old) or
3267e716630dSMartin Matuska 	 * shadow (new) location may become the "real" version of the block,
3268e716630dSMartin Matuska 	 * so both locations must have sufficient redundancy.
3269eda14cbcSMatt Macy 	 *
3270eda14cbcSMatt Macy 	 * Now that we support write reallocation, it would be better
3271eda14cbcSMatt Macy 	 * to treat partial failure as real failure unless there are
3272eda14cbcSMatt Macy 	 * no non-degraded top-level vdevs left, and not update DTLs
3273eda14cbcSMatt Macy 	 * if we intend to reallocate.
3274eda14cbcSMatt Macy 	 */
3275e716630dSMartin Matuska 	if (normal_errors > rr->rr_firstdatacol ||
3276e716630dSMartin Matuska 	    shadow_errors > rr->rr_firstdatacol) {
32777877fdebSMatt Macy 		zio->io_error = zio_worst_error(zio->io_error,
32787877fdebSMatt Macy 		    vdev_raidz_worst_error(rr));
32797877fdebSMatt Macy 	}
3280eda14cbcSMatt Macy }
3281eda14cbcSMatt Macy 
3282f9693befSMartin Matuska static void
32837877fdebSMatt Macy vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
32847877fdebSMatt Macy     raidz_row_t *rr)
32857877fdebSMatt Macy {
32867877fdebSMatt Macy 	int parity_errors = 0;
32877877fdebSMatt Macy 	int parity_untried = 0;
32887877fdebSMatt Macy 	int data_errors = 0;
32897877fdebSMatt Macy 	int total_errors = 0;
32907877fdebSMatt Macy 
32917877fdebSMatt Macy 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
32927877fdebSMatt Macy 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
32937877fdebSMatt Macy 
32947877fdebSMatt Macy 	for (int c = 0; c < rr->rr_cols; c++) {
32957877fdebSMatt Macy 		raidz_col_t *rc = &rr->rr_col[c];
32967877fdebSMatt Macy 
3297a0b956f5SMartin Matuska 		/*
3298a0b956f5SMartin Matuska 		 * If scrubbing and a replacing/sparing child vdev determined
3299a0b956f5SMartin Matuska 		 * that not all of its children have an identical copy of the
3300a0b956f5SMartin Matuska 		 * data, then clear the error so the column is treated like
3301a0b956f5SMartin Matuska 		 * any other read and force a repair to correct the damage.
3302a0b956f5SMartin Matuska 		 */
3303a0b956f5SMartin Matuska 		if (rc->rc_error == ECKSUM) {
3304a0b956f5SMartin Matuska 			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3305a0b956f5SMartin Matuska 			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3306a0b956f5SMartin Matuska 			rc->rc_force_repair = 1;
3307a0b956f5SMartin Matuska 			rc->rc_error = 0;
3308a0b956f5SMartin Matuska 		}
33097877fdebSMatt Macy 
3310a0b956f5SMartin Matuska 		if (rc->rc_error) {
33117877fdebSMatt Macy 			if (c < rr->rr_firstdatacol)
33127877fdebSMatt Macy 				parity_errors++;
33137877fdebSMatt Macy 			else
33147877fdebSMatt Macy 				data_errors++;
33157877fdebSMatt Macy 
33167877fdebSMatt Macy 			total_errors++;
33177877fdebSMatt Macy 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
33187877fdebSMatt Macy 			parity_untried++;
33197877fdebSMatt Macy 		}
33207877fdebSMatt Macy 	}
3321eda14cbcSMatt Macy 
3322eda14cbcSMatt Macy 	/*
33237877fdebSMatt Macy 	 * If there were data errors and the number of errors we saw was
33247877fdebSMatt Macy 	 * correctable -- less than or equal to the number of parity disks read
33257877fdebSMatt Macy 	 * -- reconstruct based on the missing data.
3326eda14cbcSMatt Macy 	 */
33277877fdebSMatt Macy 	if (data_errors != 0 &&
33287877fdebSMatt Macy 	    total_errors <= rr->rr_firstdatacol - parity_untried) {
3329eda14cbcSMatt Macy 		/*
3330eda14cbcSMatt Macy 		 * We either attempt to read all the parity columns or
3331eda14cbcSMatt Macy 		 * none of them. If we didn't try to read parity, we
3332eda14cbcSMatt Macy 		 * wouldn't be here in the correctable case. There must
3333eda14cbcSMatt Macy 		 * also have been fewer parity errors than parity
3334eda14cbcSMatt Macy 		 * columns or, again, we wouldn't be in this code path.
3335eda14cbcSMatt Macy 		 */
3336eda14cbcSMatt Macy 		ASSERT(parity_untried == 0);
33377877fdebSMatt Macy 		ASSERT(parity_errors < rr->rr_firstdatacol);
3338eda14cbcSMatt Macy 
3339eda14cbcSMatt Macy 		/*
3340eda14cbcSMatt Macy 		 * Identify the data columns that reported an error.
3341eda14cbcSMatt Macy 		 */
33427877fdebSMatt Macy 		int n = 0;
33437877fdebSMatt Macy 		int tgts[VDEV_RAIDZ_MAXPARITY];
33447877fdebSMatt Macy 		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
33457877fdebSMatt Macy 			raidz_col_t *rc = &rr->rr_col[c];
3346eda14cbcSMatt Macy 			if (rc->rc_error != 0) {
3347eda14cbcSMatt Macy 				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3348eda14cbcSMatt Macy 				tgts[n++] = c;
3349eda14cbcSMatt Macy 			}
3350eda14cbcSMatt Macy 		}
3351eda14cbcSMatt Macy 
33527877fdebSMatt Macy 		ASSERT(rr->rr_firstdatacol >= n);
3353eda14cbcSMatt Macy 
3354f9693befSMartin Matuska 		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3355eda14cbcSMatt Macy 	}
3356eda14cbcSMatt Macy }
3357eda14cbcSMatt Macy 
3358eda14cbcSMatt Macy /*
33597877fdebSMatt Macy  * Return the number of reads issued.
3360eda14cbcSMatt Macy  */
33617877fdebSMatt Macy static int
33627877fdebSMatt Macy vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
33637877fdebSMatt Macy {
33647877fdebSMatt Macy 	vdev_t *vd = zio->io_vd;
33657877fdebSMatt Macy 	int nread = 0;
3366eda14cbcSMatt Macy 
33677877fdebSMatt Macy 	rr->rr_missingdata = 0;
33687877fdebSMatt Macy 	rr->rr_missingparity = 0;
33697877fdebSMatt Macy 
33707877fdebSMatt Macy 	/*
33717877fdebSMatt Macy 	 * If this rows contains empty sectors which are not required
33727877fdebSMatt Macy 	 * for a normal read then allocate an ABD for them now so they
33737877fdebSMatt Macy 	 * may be read, verified, and any needed repairs performed.
33747877fdebSMatt Macy 	 */
3375e716630dSMartin Matuska 	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
33767877fdebSMatt Macy 		vdev_draid_map_alloc_empty(zio, rr);
33777877fdebSMatt Macy 
33787877fdebSMatt Macy 	for (int c = 0; c < rr->rr_cols; c++) {
33797877fdebSMatt Macy 		raidz_col_t *rc = &rr->rr_col[c];
33807877fdebSMatt Macy 		if (rc->rc_tried || rc->rc_size == 0)
3381eda14cbcSMatt Macy 			continue;
3382eda14cbcSMatt Macy 
3383eda14cbcSMatt Macy 		zio_nowait(zio_vdev_child_io(zio, NULL,
3384eda14cbcSMatt Macy 		    vd->vdev_child[rc->rc_devidx],
3385eda14cbcSMatt Macy 		    rc->rc_offset, rc->rc_abd, rc->rc_size,
3386eda14cbcSMatt Macy 		    zio->io_type, zio->io_priority, 0,
3387eda14cbcSMatt Macy 		    vdev_raidz_child_done, rc));
33887877fdebSMatt Macy 		nread++;
33897877fdebSMatt Macy 	}
33907877fdebSMatt Macy 	return (nread);
3391eda14cbcSMatt Macy }
3392eda14cbcSMatt Macy 
3393eda14cbcSMatt Macy /*
33947877fdebSMatt Macy  * We're here because either there were too many errors to even attempt
33957877fdebSMatt Macy  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
33967877fdebSMatt Macy  * failed. In either case, there is enough bad data to prevent reconstruction.
33977877fdebSMatt Macy  * Start checksum ereports for all children which haven't failed.
3398eda14cbcSMatt Macy  */
33997877fdebSMatt Macy static void
34007877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio_t *zio)
34017877fdebSMatt Macy {
34027877fdebSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
3403eda14cbcSMatt Macy 
34047877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++) {
34057877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[i];
3406eda14cbcSMatt Macy 
34077877fdebSMatt Macy 		for (int c = 0; c < rr->rr_cols; c++) {
34087877fdebSMatt Macy 			raidz_col_t *rc = &rr->rr_col[c];
34097877fdebSMatt Macy 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
34107877fdebSMatt Macy 
34112c48331dSMatt Macy 			if (rc->rc_error != 0)
34122c48331dSMatt Macy 				continue;
34132c48331dSMatt Macy 
3414eda14cbcSMatt Macy 			zio_bad_cksum_t zbc;
3415eda14cbcSMatt Macy 			zbc.zbc_has_cksum = 0;
34162c48331dSMatt Macy 			zbc.zbc_injected = rm->rm_ecksuminjected;
3417eda14cbcSMatt Macy 			mutex_enter(&cvd->vdev_stat_lock);
3418eda14cbcSMatt Macy 			cvd->vdev_stat.vs_checksum_errors++;
3419eda14cbcSMatt Macy 			mutex_exit(&cvd->vdev_stat_lock);
3420bb2d13b6SMartin Matuska 			(void) zfs_ereport_start_checksum(zio->io_spa,
3421bb2d13b6SMartin Matuska 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
3422bb2d13b6SMartin Matuska 			    rc->rc_size, &zbc);
3423eda14cbcSMatt Macy 		}
3424eda14cbcSMatt Macy 	}
3425eda14cbcSMatt Macy }
3426eda14cbcSMatt Macy 
34277877fdebSMatt Macy void
34287877fdebSMatt Macy vdev_raidz_io_done(zio_t *zio)
34297877fdebSMatt Macy {
34307877fdebSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
34317877fdebSMatt Macy 
3432e716630dSMartin Matuska 	ASSERT(zio->io_bp != NULL);
34337877fdebSMatt Macy 	if (zio->io_type == ZIO_TYPE_WRITE) {
34347877fdebSMatt Macy 		for (int i = 0; i < rm->rm_nrows; i++) {
34357877fdebSMatt Macy 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
34367877fdebSMatt Macy 		}
34377877fdebSMatt Macy 	} else {
3438e716630dSMartin Matuska 		if (rm->rm_phys_col) {
3439e716630dSMartin Matuska 			/*
3440e716630dSMartin Matuska 			 * This is an aggregated read.  Copy the data and status
3441e716630dSMartin Matuska 			 * from the aggregate abd's to the individual rows.
3442e716630dSMartin Matuska 			 */
3443e716630dSMartin Matuska 			for (int i = 0; i < rm->rm_nrows; i++) {
3444e716630dSMartin Matuska 				raidz_row_t *rr = rm->rm_row[i];
3445e716630dSMartin Matuska 
3446e716630dSMartin Matuska 				for (int c = 0; c < rr->rr_cols; c++) {
3447e716630dSMartin Matuska 					raidz_col_t *rc = &rr->rr_col[c];
3448e716630dSMartin Matuska 					if (rc->rc_tried || rc->rc_size == 0)
3449e716630dSMartin Matuska 						continue;
3450e716630dSMartin Matuska 
3451e716630dSMartin Matuska 					raidz_col_t *prc =
3452e716630dSMartin Matuska 					    &rm->rm_phys_col[rc->rc_devidx];
3453e716630dSMartin Matuska 					rc->rc_error = prc->rc_error;
3454e716630dSMartin Matuska 					rc->rc_tried = prc->rc_tried;
3455e716630dSMartin Matuska 					rc->rc_skipped = prc->rc_skipped;
3456e716630dSMartin Matuska 					if (c >= rr->rr_firstdatacol) {
3457e716630dSMartin Matuska 						/*
3458e716630dSMartin Matuska 						 * Note: this is slightly faster
3459e716630dSMartin Matuska 						 * than using abd_copy_off().
3460e716630dSMartin Matuska 						 */
3461e716630dSMartin Matuska 						char *physbuf = abd_to_buf(
3462e716630dSMartin Matuska 						    prc->rc_abd);
3463e716630dSMartin Matuska 						void *physloc = physbuf +
3464e716630dSMartin Matuska 						    rc->rc_offset -
3465e716630dSMartin Matuska 						    prc->rc_offset;
3466e716630dSMartin Matuska 
3467e716630dSMartin Matuska 						abd_copy_from_buf(rc->rc_abd,
3468e716630dSMartin Matuska 						    physloc, rc->rc_size);
3469e716630dSMartin Matuska 					}
3470e716630dSMartin Matuska 				}
3471e716630dSMartin Matuska 			}
3472e716630dSMartin Matuska 		}
3473e716630dSMartin Matuska 
34747877fdebSMatt Macy 		for (int i = 0; i < rm->rm_nrows; i++) {
34757877fdebSMatt Macy 			raidz_row_t *rr = rm->rm_row[i];
34767877fdebSMatt Macy 			vdev_raidz_io_done_reconstruct_known_missing(zio,
34777877fdebSMatt Macy 			    rm, rr);
34787877fdebSMatt Macy 		}
34797877fdebSMatt Macy 
34807877fdebSMatt Macy 		if (raidz_checksum_verify(zio) == 0) {
348187bf66d4SMartin Matuska 			if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
348287bf66d4SMartin Matuska 				goto done;
348387bf66d4SMartin Matuska 
34847877fdebSMatt Macy 			for (int i = 0; i < rm->rm_nrows; i++) {
34857877fdebSMatt Macy 				raidz_row_t *rr = rm->rm_row[i];
34867877fdebSMatt Macy 				vdev_raidz_io_done_verified(zio, rr);
34877877fdebSMatt Macy 			}
3488eda14cbcSMatt Macy 			zio_checksum_verified(zio);
34897877fdebSMatt Macy 		} else {
3490eda14cbcSMatt Macy 			/*
34917877fdebSMatt Macy 			 * A sequential resilver has no checksum which makes
34927877fdebSMatt Macy 			 * combinatoral reconstruction impossible. This code
34937877fdebSMatt Macy 			 * path is unreachable since raidz_checksum_verify()
34947877fdebSMatt Macy 			 * has no checksum to verify and must succeed.
3495eda14cbcSMatt Macy 			 */
34967877fdebSMatt Macy 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3497eda14cbcSMatt Macy 
34987877fdebSMatt Macy 			/*
34997877fdebSMatt Macy 			 * This isn't a typical situation -- either we got a
35007877fdebSMatt Macy 			 * read error or a child silently returned bad data.
35017877fdebSMatt Macy 			 * Read every block so we can try again with as much
35027877fdebSMatt Macy 			 * data and parity as we can track down. If we've
35037877fdebSMatt Macy 			 * already been through once before, all children will
35047877fdebSMatt Macy 			 * be marked as tried so we'll proceed to combinatorial
35057877fdebSMatt Macy 			 * reconstruction.
35067877fdebSMatt Macy 			 */
35077877fdebSMatt Macy 			int nread = 0;
35087877fdebSMatt Macy 			for (int i = 0; i < rm->rm_nrows; i++) {
35097877fdebSMatt Macy 				nread += vdev_raidz_read_all(zio,
35107877fdebSMatt Macy 				    rm->rm_row[i]);
35117877fdebSMatt Macy 			}
35127877fdebSMatt Macy 			if (nread != 0) {
35137877fdebSMatt Macy 				/*
35147877fdebSMatt Macy 				 * Normally our stage is VDEV_IO_DONE, but if
35157877fdebSMatt Macy 				 * we've already called redone(), it will have
35167877fdebSMatt Macy 				 * changed to VDEV_IO_START, in which case we
35177877fdebSMatt Macy 				 * don't want to call redone() again.
35187877fdebSMatt Macy 				 */
35197877fdebSMatt Macy 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
35207877fdebSMatt Macy 					zio_vdev_io_redone(zio);
35217877fdebSMatt Macy 				return;
35227877fdebSMatt Macy 			}
3523e716630dSMartin Matuska 			/*
3524e716630dSMartin Matuska 			 * It would be too expensive to try every possible
3525e716630dSMartin Matuska 			 * combination of failed sectors in every row, so
3526e716630dSMartin Matuska 			 * instead we try every combination of failed current or
3527e716630dSMartin Matuska 			 * past physical disk. This means that if the incorrect
3528e716630dSMartin Matuska 			 * sectors were all on Nparity disks at any point in the
3529e716630dSMartin Matuska 			 * past, we will find the correct data.  The only known
3530e716630dSMartin Matuska 			 * case where this is less durable than a non-expanded
3531e716630dSMartin Matuska 			 * RAIDZ, is if we have a silent failure during
3532e716630dSMartin Matuska 			 * expansion.  In that case, one block could be
3533e716630dSMartin Matuska 			 * partially in the old format and partially in the
3534e716630dSMartin Matuska 			 * new format, so we'd lost some sectors from the old
3535e716630dSMartin Matuska 			 * format and some from the new format.
3536e716630dSMartin Matuska 			 *
3537e716630dSMartin Matuska 			 * e.g. logical_width=4 physical_width=6
3538e716630dSMartin Matuska 			 * the 15 (6+5+4) possible failed disks are:
3539e716630dSMartin Matuska 			 * width=6 child=0
3540e716630dSMartin Matuska 			 * width=6 child=1
3541e716630dSMartin Matuska 			 * width=6 child=2
3542e716630dSMartin Matuska 			 * width=6 child=3
3543e716630dSMartin Matuska 			 * width=6 child=4
3544e716630dSMartin Matuska 			 * width=6 child=5
3545e716630dSMartin Matuska 			 * width=5 child=0
3546e716630dSMartin Matuska 			 * width=5 child=1
3547e716630dSMartin Matuska 			 * width=5 child=2
3548e716630dSMartin Matuska 			 * width=5 child=3
3549e716630dSMartin Matuska 			 * width=5 child=4
3550e716630dSMartin Matuska 			 * width=4 child=0
3551e716630dSMartin Matuska 			 * width=4 child=1
3552e716630dSMartin Matuska 			 * width=4 child=2
3553e716630dSMartin Matuska 			 * width=4 child=3
3554e716630dSMartin Matuska 			 * And we will try every combination of Nparity of these
3555e716630dSMartin Matuska 			 * failing.
3556e716630dSMartin Matuska 			 *
3557e716630dSMartin Matuska 			 * As a first pass, we can generate every combo,
3558e716630dSMartin Matuska 			 * and try reconstructing, ignoring any known
3559e716630dSMartin Matuska 			 * failures.  If any row has too many known + simulated
3560e716630dSMartin Matuska 			 * failures, then we bail on reconstructing with this
3561e716630dSMartin Matuska 			 * number of simulated failures.  As an improvement,
3562e716630dSMartin Matuska 			 * we could detect the number of whole known failures
3563e716630dSMartin Matuska 			 * (i.e. we have known failures on these disks for
3564e716630dSMartin Matuska 			 * every row; the disks never succeeded), and
3565e716630dSMartin Matuska 			 * subtract that from the max # failures to simulate.
3566e716630dSMartin Matuska 			 * We could go even further like the current
3567e716630dSMartin Matuska 			 * combrec code, but that doesn't seem like it
3568e716630dSMartin Matuska 			 * gains us very much.  If we simulate a failure
3569e716630dSMartin Matuska 			 * that is also a known failure, that's fine.
3570e716630dSMartin Matuska 			 */
35717877fdebSMatt Macy 			zio->io_error = vdev_raidz_combrec(zio);
35727877fdebSMatt Macy 			if (zio->io_error == ECKSUM &&
35737877fdebSMatt Macy 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
35747877fdebSMatt Macy 				vdev_raidz_io_done_unrecoverable(zio);
35757877fdebSMatt Macy 			}
3576eda14cbcSMatt Macy 		}
3577eda14cbcSMatt Macy 	}
357887bf66d4SMartin Matuska done:
3579e716630dSMartin Matuska 	if (rm->rm_lr != NULL) {
3580e716630dSMartin Matuska 		zfs_rangelock_exit(rm->rm_lr);
3581e716630dSMartin Matuska 		rm->rm_lr = NULL;
3582e716630dSMartin Matuska 	}
3583eda14cbcSMatt Macy }
3584eda14cbcSMatt Macy 
3585eda14cbcSMatt Macy static void
3586eda14cbcSMatt Macy vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3587eda14cbcSMatt Macy {
35887877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
35897877fdebSMatt Macy 	if (faulted > vdrz->vd_nparity)
3590eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3591eda14cbcSMatt Macy 		    VDEV_AUX_NO_REPLICAS);
3592eda14cbcSMatt Macy 	else if (degraded + faulted != 0)
3593eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3594eda14cbcSMatt Macy 	else
3595eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3596eda14cbcSMatt Macy }
3597eda14cbcSMatt Macy 
3598eda14cbcSMatt Macy /*
3599eda14cbcSMatt Macy  * Determine if any portion of the provided block resides on a child vdev
3600eda14cbcSMatt Macy  * with a dirty DTL and therefore needs to be resilvered.  The function
3601eda14cbcSMatt Macy  * assumes that at least one DTL is dirty which implies that full stripe
3602eda14cbcSMatt Macy  * width blocks must be resilvered.
3603eda14cbcSMatt Macy  */
3604eda14cbcSMatt Macy static boolean_t
36057877fdebSMatt Macy vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
36067877fdebSMatt Macy     uint64_t phys_birth)
3607eda14cbcSMatt Macy {
36087877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
3609e716630dSMartin Matuska 
3610e716630dSMartin Matuska 	/*
3611e716630dSMartin Matuska 	 * If we're in the middle of a RAIDZ expansion, this block may be in
3612e716630dSMartin Matuska 	 * the old and/or new location.  For simplicity, always resilver it.
3613e716630dSMartin Matuska 	 */
3614e716630dSMartin Matuska 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3615e716630dSMartin Matuska 		return (B_TRUE);
3616e716630dSMartin Matuska 
3617eda14cbcSMatt Macy 	uint64_t dcols = vd->vdev_children;
36187877fdebSMatt Macy 	uint64_t nparity = vdrz->vd_nparity;
3619eda14cbcSMatt Macy 	uint64_t ashift = vd->vdev_top->vdev_ashift;
3620eda14cbcSMatt Macy 	/* The starting RAIDZ (parent) vdev sector of the block. */
36217877fdebSMatt Macy 	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3622eda14cbcSMatt Macy 	/* The zio's size in units of the vdev's minimum sector size. */
3623eda14cbcSMatt Macy 	uint64_t s = ((psize - 1) >> ashift) + 1;
3624eda14cbcSMatt Macy 	/* The first column for this stripe. */
3625eda14cbcSMatt Macy 	uint64_t f = b % dcols;
3626eda14cbcSMatt Macy 
36277877fdebSMatt Macy 	/* Unreachable by sequential resilver. */
36287877fdebSMatt Macy 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
36297877fdebSMatt Macy 
36307877fdebSMatt Macy 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
36317877fdebSMatt Macy 		return (B_FALSE);
36327877fdebSMatt Macy 
3633eda14cbcSMatt Macy 	if (s + nparity >= dcols)
3634eda14cbcSMatt Macy 		return (B_TRUE);
3635eda14cbcSMatt Macy 
3636eda14cbcSMatt Macy 	for (uint64_t c = 0; c < s + nparity; c++) {
3637eda14cbcSMatt Macy 		uint64_t devidx = (f + c) % dcols;
3638eda14cbcSMatt Macy 		vdev_t *cvd = vd->vdev_child[devidx];
3639eda14cbcSMatt Macy 
3640eda14cbcSMatt Macy 		/*
3641eda14cbcSMatt Macy 		 * dsl_scan_need_resilver() already checked vd with
3642eda14cbcSMatt Macy 		 * vdev_dtl_contains(). So here just check cvd with
3643eda14cbcSMatt Macy 		 * vdev_dtl_empty(), cheaper and a good approximation.
3644eda14cbcSMatt Macy 		 */
3645eda14cbcSMatt Macy 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3646eda14cbcSMatt Macy 			return (B_TRUE);
3647eda14cbcSMatt Macy 	}
3648eda14cbcSMatt Macy 
3649eda14cbcSMatt Macy 	return (B_FALSE);
3650eda14cbcSMatt Macy }
3651eda14cbcSMatt Macy 
3652eda14cbcSMatt Macy static void
36537877fdebSMatt Macy vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
36547877fdebSMatt Macy     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
3655eda14cbcSMatt Macy {
3656e92ffd9bSMartin Matuska 	(void) remain_rs;
3657e92ffd9bSMartin Matuska 
3658eda14cbcSMatt Macy 	vdev_t *raidvd = cvd->vdev_parent;
3659eda14cbcSMatt Macy 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3660eda14cbcSMatt Macy 
3661e716630dSMartin Matuska 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3662e716630dSMartin Matuska 
3663e716630dSMartin Matuska 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3664e716630dSMartin Matuska 		/*
3665e716630dSMartin Matuska 		 * We're in the middle of expansion, in which case the
3666e716630dSMartin Matuska 		 * translation is in flux.  Any answer we give may be wrong
3667e716630dSMartin Matuska 		 * by the time we return, so it isn't safe for the caller to
3668e716630dSMartin Matuska 		 * act on it.  Therefore we say that this range isn't present
3669e716630dSMartin Matuska 		 * on any children.  The only consumers of this are "zpool
3670e716630dSMartin Matuska 		 * initialize" and trimming, both of which are "best effort"
3671e716630dSMartin Matuska 		 * anyway.
3672e716630dSMartin Matuska 		 */
3673e716630dSMartin Matuska 		physical_rs->rs_start = physical_rs->rs_end = 0;
3674e716630dSMartin Matuska 		remain_rs->rs_start = remain_rs->rs_end = 0;
3675e716630dSMartin Matuska 		return;
3676e716630dSMartin Matuska 	}
3677e716630dSMartin Matuska 
3678e716630dSMartin Matuska 	uint64_t width = vdrz->vd_physical_width;
3679eda14cbcSMatt Macy 	uint64_t tgt_col = cvd->vdev_id;
3680eda14cbcSMatt Macy 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3681eda14cbcSMatt Macy 
3682eda14cbcSMatt Macy 	/* make sure the offsets are block-aligned */
36837877fdebSMatt Macy 	ASSERT0(logical_rs->rs_start % (1 << ashift));
36847877fdebSMatt Macy 	ASSERT0(logical_rs->rs_end % (1 << ashift));
36857877fdebSMatt Macy 	uint64_t b_start = logical_rs->rs_start >> ashift;
36867877fdebSMatt Macy 	uint64_t b_end = logical_rs->rs_end >> ashift;
3687eda14cbcSMatt Macy 
3688eda14cbcSMatt Macy 	uint64_t start_row = 0;
3689eda14cbcSMatt Macy 	if (b_start > tgt_col) /* avoid underflow */
3690eda14cbcSMatt Macy 		start_row = ((b_start - tgt_col - 1) / width) + 1;
3691eda14cbcSMatt Macy 
3692eda14cbcSMatt Macy 	uint64_t end_row = 0;
3693eda14cbcSMatt Macy 	if (b_end > tgt_col)
3694eda14cbcSMatt Macy 		end_row = ((b_end - tgt_col - 1) / width) + 1;
3695eda14cbcSMatt Macy 
36967877fdebSMatt Macy 	physical_rs->rs_start = start_row << ashift;
36977877fdebSMatt Macy 	physical_rs->rs_end = end_row << ashift;
3698eda14cbcSMatt Macy 
36997877fdebSMatt Macy 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
37007877fdebSMatt Macy 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
37017877fdebSMatt Macy 	    logical_rs->rs_end - logical_rs->rs_start);
37027877fdebSMatt Macy }
37037877fdebSMatt Macy 
3704e716630dSMartin Matuska static void
3705e716630dSMartin Matuska raidz_reflow_sync(void *arg, dmu_tx_t *tx)
3706e716630dSMartin Matuska {
3707e716630dSMartin Matuska 	spa_t *spa = arg;
3708e716630dSMartin Matuska 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3709e716630dSMartin Matuska 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3710e716630dSMartin Matuska 
3711e716630dSMartin Matuska 	/*
3712e716630dSMartin Matuska 	 * Ensure there are no i/os to the range that is being committed.
3713e716630dSMartin Matuska 	 */
3714e716630dSMartin Matuska 	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3715e716630dSMartin Matuska 	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
3716e716630dSMartin Matuska 
3717e716630dSMartin Matuska 	mutex_enter(&vre->vre_lock);
3718e716630dSMartin Matuska 	uint64_t new_offset =
3719e716630dSMartin Matuska 	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3720e716630dSMartin Matuska 	/*
3721e716630dSMartin Matuska 	 * We should not have committed anything that failed.
3722e716630dSMartin Matuska 	 */
3723e716630dSMartin Matuska 	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3724e716630dSMartin Matuska 	mutex_exit(&vre->vre_lock);
3725e716630dSMartin Matuska 
3726e716630dSMartin Matuska 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3727e716630dSMartin Matuska 	    old_offset, new_offset - old_offset,
3728e716630dSMartin Matuska 	    RL_WRITER);
3729e716630dSMartin Matuska 
3730e716630dSMartin Matuska 	/*
3731e716630dSMartin Matuska 	 * Update the uberblock that will be written when this txg completes.
3732e716630dSMartin Matuska 	 */
3733e716630dSMartin Matuska 	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3734e716630dSMartin Matuska 	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3735e716630dSMartin Matuska 	vre->vre_offset_pertxg[txgoff] = 0;
3736e716630dSMartin Matuska 	zfs_rangelock_exit(lr);
3737e716630dSMartin Matuska 
3738e716630dSMartin Matuska 	mutex_enter(&vre->vre_lock);
3739e716630dSMartin Matuska 	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3740e716630dSMartin Matuska 	vre->vre_bytes_copied_pertxg[txgoff] = 0;
3741e716630dSMartin Matuska 	mutex_exit(&vre->vre_lock);
3742e716630dSMartin Matuska 
3743e716630dSMartin Matuska 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3744e716630dSMartin Matuska 	VERIFY0(zap_update(spa->spa_meta_objset,
3745e716630dSMartin Matuska 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3746e716630dSMartin Matuska 	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3747e716630dSMartin Matuska }
3748e716630dSMartin Matuska 
3749e716630dSMartin Matuska static void
3750e716630dSMartin Matuska raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3751e716630dSMartin Matuska {
3752e716630dSMartin Matuska 	spa_t *spa = arg;
3753e716630dSMartin Matuska 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3754e716630dSMartin Matuska 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3755e716630dSMartin Matuska 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3756e716630dSMartin Matuska 
3757e716630dSMartin Matuska 	for (int i = 0; i < TXG_SIZE; i++)
3758e716630dSMartin Matuska 		VERIFY0(vre->vre_offset_pertxg[i]);
3759e716630dSMartin Matuska 
3760e716630dSMartin Matuska 	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3761e716630dSMartin Matuska 	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3762e716630dSMartin Matuska 	re->re_logical_width = vdrz->vd_physical_width;
3763e716630dSMartin Matuska 	mutex_enter(&vdrz->vd_expand_lock);
3764e716630dSMartin Matuska 	avl_add(&vdrz->vd_expand_txgs, re);
3765e716630dSMartin Matuska 	mutex_exit(&vdrz->vd_expand_lock);
3766e716630dSMartin Matuska 
3767e716630dSMartin Matuska 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3768e716630dSMartin Matuska 
3769e716630dSMartin Matuska 	/*
3770e716630dSMartin Matuska 	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3771e716630dSMartin Matuska 	 * will get written (based on vd_expand_txgs).
3772e716630dSMartin Matuska 	 */
3773e716630dSMartin Matuska 	vdev_config_dirty(vd);
3774e716630dSMartin Matuska 
3775e716630dSMartin Matuska 	/*
3776e716630dSMartin Matuska 	 * Before we change vre_state, the on-disk state must reflect that we
3777e716630dSMartin Matuska 	 * have completed all copying, so that vdev_raidz_io_start() can use
3778e716630dSMartin Matuska 	 * vre_state to determine if the reflow is in progress.  See also the
3779e716630dSMartin Matuska 	 * end of spa_raidz_expand_thread().
3780e716630dSMartin Matuska 	 */
3781e716630dSMartin Matuska 	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3782e716630dSMartin Matuska 	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3783e716630dSMartin Matuska 
3784e716630dSMartin Matuska 	vre->vre_end_time = gethrestime_sec();
3785e716630dSMartin Matuska 	vre->vre_state = DSS_FINISHED;
3786e716630dSMartin Matuska 
3787e716630dSMartin Matuska 	uint64_t state = vre->vre_state;
3788e716630dSMartin Matuska 	VERIFY0(zap_update(spa->spa_meta_objset,
3789e716630dSMartin Matuska 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3790e716630dSMartin Matuska 	    sizeof (state), 1, &state, tx));
3791e716630dSMartin Matuska 
3792e716630dSMartin Matuska 	uint64_t end_time = vre->vre_end_time;
3793e716630dSMartin Matuska 	VERIFY0(zap_update(spa->spa_meta_objset,
3794e716630dSMartin Matuska 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3795e716630dSMartin Matuska 	    sizeof (end_time), 1, &end_time, tx));
3796e716630dSMartin Matuska 
3797e716630dSMartin Matuska 	spa->spa_uberblock.ub_raidz_reflow_info = 0;
3798e716630dSMartin Matuska 
3799e716630dSMartin Matuska 	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
3800e716630dSMartin Matuska 	    "%s vdev %llu new width %llu", spa_name(spa),
3801e716630dSMartin Matuska 	    (unsigned long long)vd->vdev_id,
3802e716630dSMartin Matuska 	    (unsigned long long)vd->vdev_children);
3803e716630dSMartin Matuska 
3804e716630dSMartin Matuska 	spa->spa_raidz_expand = NULL;
3805e716630dSMartin Matuska 	raidvd->vdev_rz_expanding = B_FALSE;
3806e716630dSMartin Matuska 
3807e716630dSMartin Matuska 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3808e716630dSMartin Matuska 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3809e716630dSMartin Matuska 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3810e716630dSMartin Matuska 
3811e716630dSMartin Matuska 	spa_notify_waiters(spa);
3812e716630dSMartin Matuska 
3813e716630dSMartin Matuska 	/*
3814e716630dSMartin Matuska 	 * While we're in syncing context take the opportunity to
3815e716630dSMartin Matuska 	 * setup a scrub. All the data has been sucessfully copied
3816e716630dSMartin Matuska 	 * but we have not validated any checksums.
3817e716630dSMartin Matuska 	 */
381817aab35aSMartin Matuska 	setup_sync_arg_t setup_sync_arg = {
381917aab35aSMartin Matuska 		.func = POOL_SCAN_SCRUB,
382017aab35aSMartin Matuska 		.txgstart = 0,
382117aab35aSMartin Matuska 		.txgend = 0,
382217aab35aSMartin Matuska 	};
382317aab35aSMartin Matuska 	if (zfs_scrub_after_expand &&
382417aab35aSMartin Matuska 	    dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
382517aab35aSMartin Matuska 		dsl_scan_setup_sync(&setup_sync_arg, tx);
382617aab35aSMartin Matuska 	}
3827e716630dSMartin Matuska }
3828e716630dSMartin Matuska 
3829e716630dSMartin Matuska /*
383017aab35aSMartin Matuska  * State of one copy batch.
3831e716630dSMartin Matuska  */
3832e716630dSMartin Matuska typedef struct raidz_reflow_arg {
383317aab35aSMartin Matuska 	vdev_raidz_expand_t *rra_vre;	/* Global expantion state. */
383417aab35aSMartin Matuska 	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
383517aab35aSMartin Matuska 	uint64_t rra_txg;	/* TXG of this batch. */
383617aab35aSMartin Matuska 	uint_t rra_ashift;	/* Ashift of the vdev. */
383717aab35aSMartin Matuska 	uint32_t rra_tbd;	/* Number of in-flight ZIOs. */
383817aab35aSMartin Matuska 	uint32_t rra_writes;	/* Number of write ZIOs. */
383917aab35aSMartin Matuska 	zio_t *rra_zio[];	/* Write ZIO pointers. */
3840e716630dSMartin Matuska } raidz_reflow_arg_t;
3841e716630dSMartin Matuska 
3842e716630dSMartin Matuska /*
384317aab35aSMartin Matuska  * Write of the new location on one child is done.  Once all of them are done
384417aab35aSMartin Matuska  * we can unlock and free everything.
3845e716630dSMartin Matuska  */
3846e716630dSMartin Matuska static void
3847e716630dSMartin Matuska raidz_reflow_write_done(zio_t *zio)
3848e716630dSMartin Matuska {
3849e716630dSMartin Matuska 	raidz_reflow_arg_t *rra = zio->io_private;
3850e716630dSMartin Matuska 	vdev_raidz_expand_t *vre = rra->rra_vre;
3851e716630dSMartin Matuska 
3852e716630dSMartin Matuska 	abd_free(zio->io_abd);
3853e716630dSMartin Matuska 
3854e716630dSMartin Matuska 	mutex_enter(&vre->vre_lock);
3855e716630dSMartin Matuska 	if (zio->io_error != 0) {
3856e716630dSMartin Matuska 		/* Force a reflow pause on errors */
3857e716630dSMartin Matuska 		vre->vre_failed_offset =
3858e716630dSMartin Matuska 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3859e716630dSMartin Matuska 	}
3860e716630dSMartin Matuska 	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3861e716630dSMartin Matuska 	vre->vre_outstanding_bytes -= zio->io_size;
3862e716630dSMartin Matuska 	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3863e716630dSMartin Matuska 	    vre->vre_failed_offset) {
3864e716630dSMartin Matuska 		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3865e716630dSMartin Matuska 		    zio->io_size;
3866e716630dSMartin Matuska 	}
3867e716630dSMartin Matuska 	cv_signal(&vre->vre_cv);
386817aab35aSMartin Matuska 	boolean_t done = (--rra->rra_tbd == 0);
3869e716630dSMartin Matuska 	mutex_exit(&vre->vre_lock);
3870e716630dSMartin Matuska 
387117aab35aSMartin Matuska 	if (!done)
387217aab35aSMartin Matuska 		return;
3873e716630dSMartin Matuska 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
387417aab35aSMartin Matuska 	zfs_rangelock_exit(rra->rra_lr);
387517aab35aSMartin Matuska 	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
3876e716630dSMartin Matuska }
3877e716630dSMartin Matuska 
3878e716630dSMartin Matuska /*
387917aab35aSMartin Matuska  * Read of the old location on one child is done.  Once all of them are done
388017aab35aSMartin Matuska  * writes should have all the data and we can issue them.
3881e716630dSMartin Matuska  */
3882e716630dSMartin Matuska static void
3883e716630dSMartin Matuska raidz_reflow_read_done(zio_t *zio)
3884e716630dSMartin Matuska {
3885e716630dSMartin Matuska 	raidz_reflow_arg_t *rra = zio->io_private;
3886e716630dSMartin Matuska 	vdev_raidz_expand_t *vre = rra->rra_vre;
3887e716630dSMartin Matuska 
388817aab35aSMartin Matuska 	/* Reads of only one block use write ABDs.  For bigger free gangs. */
388917aab35aSMartin Matuska 	if (zio->io_size > (1 << rra->rra_ashift))
389017aab35aSMartin Matuska 		abd_free(zio->io_abd);
389117aab35aSMartin Matuska 
3892e716630dSMartin Matuska 	/*
3893e716630dSMartin Matuska 	 * If the read failed, or if it was done on a vdev that is not fully
3894e716630dSMartin Matuska 	 * healthy (e.g. a child that has a resilver in progress), we may not
3895e716630dSMartin Matuska 	 * have the correct data.  Note that it's OK if the write proceeds.
3896e716630dSMartin Matuska 	 * It may write garbage but the location is otherwise unused and we
3897e716630dSMartin Matuska 	 * will retry later due to vre_failed_offset.
3898e716630dSMartin Matuska 	 */
3899e716630dSMartin Matuska 	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3900e716630dSMartin Matuska 		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3901e716630dSMartin Matuska 		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3902e716630dSMartin Matuska 		    (long long)rra->rra_lr->lr_offset,
3903e716630dSMartin Matuska 		    (long long)rra->rra_lr->lr_length,
3904e716630dSMartin Matuska 		    (long long)rra->rra_txg,
3905e716630dSMartin Matuska 		    zio->io_error,
3906e716630dSMartin Matuska 		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3907e716630dSMartin Matuska 		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3908e716630dSMartin Matuska 		mutex_enter(&vre->vre_lock);
3909e716630dSMartin Matuska 		/* Force a reflow pause on errors */
3910e716630dSMartin Matuska 		vre->vre_failed_offset =
3911e716630dSMartin Matuska 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3912e716630dSMartin Matuska 		mutex_exit(&vre->vre_lock);
3913e716630dSMartin Matuska 	}
3914e716630dSMartin Matuska 
391517aab35aSMartin Matuska 	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
391617aab35aSMartin Matuska 		return;
3917*dd215568SMartin Matuska 	uint32_t writes = rra->rra_tbd = rra->rra_writes;
3918*dd215568SMartin Matuska 	for (uint64_t i = 0; i < writes; i++)
391917aab35aSMartin Matuska 		zio_nowait(rra->rra_zio[i]);
3920e716630dSMartin Matuska }
3921e716630dSMartin Matuska 
3922e716630dSMartin Matuska static void
3923e716630dSMartin Matuska raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3924e716630dSMartin Matuska     dmu_tx_t *tx)
3925e716630dSMartin Matuska {
3926e716630dSMartin Matuska 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3927e716630dSMartin Matuska 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3928e716630dSMartin Matuska 
3929e716630dSMartin Matuska 	if (offset == 0)
3930e716630dSMartin Matuska 		return;
3931e716630dSMartin Matuska 
3932e716630dSMartin Matuska 	mutex_enter(&vre->vre_lock);
3933e716630dSMartin Matuska 	ASSERT3U(vre->vre_offset, <=, offset);
3934e716630dSMartin Matuska 	vre->vre_offset = offset;
3935e716630dSMartin Matuska 	mutex_exit(&vre->vre_lock);
3936e716630dSMartin Matuska 
3937e716630dSMartin Matuska 	if (vre->vre_offset_pertxg[txgoff] == 0) {
3938e716630dSMartin Matuska 		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3939e716630dSMartin Matuska 		    spa, tx);
3940e716630dSMartin Matuska 	}
3941e716630dSMartin Matuska 	vre->vre_offset_pertxg[txgoff] = offset;
3942e716630dSMartin Matuska }
3943e716630dSMartin Matuska 
3944e716630dSMartin Matuska static boolean_t
3945e716630dSMartin Matuska vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3946e716630dSMartin Matuska {
3947e716630dSMartin Matuska 	for (int i = 0; i < raidz_vd->vdev_children; i++) {
3948e716630dSMartin Matuska 		/* Quick check if a child is being replaced */
3949e716630dSMartin Matuska 		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3950e716630dSMartin Matuska 			return (B_TRUE);
3951e716630dSMartin Matuska 	}
3952e716630dSMartin Matuska 	return (B_FALSE);
3953e716630dSMartin Matuska }
3954e716630dSMartin Matuska 
3955e716630dSMartin Matuska static boolean_t
3956e716630dSMartin Matuska raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
3957e716630dSMartin Matuska     dmu_tx_t *tx)
3958e716630dSMartin Matuska {
3959e716630dSMartin Matuska 	spa_t *spa = vd->vdev_spa;
396017aab35aSMartin Matuska 	uint_t ashift = vd->vdev_top->vdev_ashift;
3961e716630dSMartin Matuska 
396217aab35aSMartin Matuska 	range_seg_t *rs = range_tree_first(rt);
396317aab35aSMartin Matuska 	if (rt == NULL)
3964e716630dSMartin Matuska 		return (B_FALSE);
396517aab35aSMartin Matuska 	uint64_t offset = rs_get_start(rs, rt);
3966e716630dSMartin Matuska 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
396717aab35aSMartin Matuska 	uint64_t size = rs_get_end(rs, rt) - offset;
3968e716630dSMartin Matuska 	ASSERT3U(size, >=, 1 << ashift);
396917aab35aSMartin Matuska 	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
3970e716630dSMartin Matuska 
3971e716630dSMartin Matuska 	uint64_t blkid = offset >> ashift;
397217aab35aSMartin Matuska 	uint_t old_children = vd->vdev_children - 1;
3973e716630dSMartin Matuska 
3974e716630dSMartin Matuska 	/*
3975e716630dSMartin Matuska 	 * We can only progress to the point that writes will not overlap
3976e716630dSMartin Matuska 	 * with blocks whose progress has not yet been recorded on disk.
3977e716630dSMartin Matuska 	 * Since partially-copied rows are still read from the old location,
3978e716630dSMartin Matuska 	 * we need to stop one row before the sector-wise overlap, to prevent
3979e716630dSMartin Matuska 	 * row-wise overlap.
3980e716630dSMartin Matuska 	 *
3981e716630dSMartin Matuska 	 * Note that even if we are skipping over a large unallocated region,
3982e716630dSMartin Matuska 	 * we can't move the on-disk progress to `offset`, because concurrent
3983e716630dSMartin Matuska 	 * writes/allocations could still use the currently-unallocated
3984e716630dSMartin Matuska 	 * region.
3985e716630dSMartin Matuska 	 */
3986e716630dSMartin Matuska 	uint64_t ubsync_blkid =
3987e716630dSMartin Matuska 	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
3988e716630dSMartin Matuska 	uint64_t next_overwrite_blkid = ubsync_blkid +
3989e716630dSMartin Matuska 	    ubsync_blkid / old_children - old_children;
3990e716630dSMartin Matuska 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
3991e716630dSMartin Matuska 	if (blkid >= next_overwrite_blkid) {
3992e716630dSMartin Matuska 		raidz_reflow_record_progress(vre,
3993e716630dSMartin Matuska 		    next_overwrite_blkid << ashift, tx);
3994e716630dSMartin Matuska 		return (B_TRUE);
3995e716630dSMartin Matuska 	}
3996e716630dSMartin Matuska 
399717aab35aSMartin Matuska 	size = MIN(size, raidz_expand_max_copy_bytes);
399817aab35aSMartin Matuska 	size = MIN(size, (uint64_t)old_children *
399917aab35aSMartin Matuska 	    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
400017aab35aSMartin Matuska 	size = MAX(size, 1 << ashift);
400117aab35aSMartin Matuska 	uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
400217aab35aSMartin Matuska 	size = (uint64_t)blocks << ashift;
4003e716630dSMartin Matuska 
400417aab35aSMartin Matuska 	range_tree_remove(rt, offset, size);
400517aab35aSMartin Matuska 
400617aab35aSMartin Matuska 	uint_t reads = MIN(blocks, old_children);
400717aab35aSMartin Matuska 	uint_t writes = MIN(blocks, vd->vdev_children);
400817aab35aSMartin Matuska 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
400917aab35aSMartin Matuska 	    sizeof (zio_t *) * writes, KM_SLEEP);
4010e716630dSMartin Matuska 	rra->rra_vre = vre;
4011e716630dSMartin Matuska 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
401217aab35aSMartin Matuska 	    offset, size, RL_WRITER);
4013e716630dSMartin Matuska 	rra->rra_txg = dmu_tx_get_txg(tx);
401417aab35aSMartin Matuska 	rra->rra_ashift = ashift;
401517aab35aSMartin Matuska 	rra->rra_tbd = reads;
401617aab35aSMartin Matuska 	rra->rra_writes = writes;
4017e716630dSMartin Matuska 
401817aab35aSMartin Matuska 	raidz_reflow_record_progress(vre, offset + size, tx);
4019e716630dSMartin Matuska 
4020e716630dSMartin Matuska 	/*
4021e716630dSMartin Matuska 	 * SCL_STATE will be released when the read and write are done,
4022e716630dSMartin Matuska 	 * by raidz_reflow_write_done().
4023e716630dSMartin Matuska 	 */
4024e716630dSMartin Matuska 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
4025e716630dSMartin Matuska 
4026e716630dSMartin Matuska 	/* check if a replacing vdev was added, if so treat it as an error */
4027e716630dSMartin Matuska 	if (vdev_raidz_expand_child_replacing(vd)) {
4028e716630dSMartin Matuska 		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
4029e716630dSMartin Matuska 		    "offset=%llu txg=%llu",
4030e716630dSMartin Matuska 		    (long long)rra->rra_lr->lr_offset,
4031e716630dSMartin Matuska 		    (long long)rra->rra_txg);
4032e716630dSMartin Matuska 
4033e716630dSMartin Matuska 		mutex_enter(&vre->vre_lock);
4034e716630dSMartin Matuska 		vre->vre_failed_offset =
4035e716630dSMartin Matuska 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
4036e716630dSMartin Matuska 		cv_signal(&vre->vre_cv);
4037e716630dSMartin Matuska 		mutex_exit(&vre->vre_lock);
4038e716630dSMartin Matuska 
4039e716630dSMartin Matuska 		/* drop everything we acquired */
4040e716630dSMartin Matuska 		spa_config_exit(spa, SCL_STATE, spa);
404117aab35aSMartin Matuska 		zfs_rangelock_exit(rra->rra_lr);
404217aab35aSMartin Matuska 		kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
4043e716630dSMartin Matuska 		return (B_TRUE);
4044e716630dSMartin Matuska 	}
4045e716630dSMartin Matuska 
404617aab35aSMartin Matuska 	mutex_enter(&vre->vre_lock);
404717aab35aSMartin Matuska 	vre->vre_outstanding_bytes += size;
404817aab35aSMartin Matuska 	mutex_exit(&vre->vre_lock);
4049e716630dSMartin Matuska 
405017aab35aSMartin Matuska 	/* Allocate ABD and ZIO for each child we write. */
405117aab35aSMartin Matuska 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
405217aab35aSMartin Matuska 	zio_t *pio = spa->spa_txg_zio[txgoff];
405317aab35aSMartin Matuska 	uint_t b = blocks / vd->vdev_children;
405417aab35aSMartin Matuska 	uint_t bb = blocks % vd->vdev_children;
405517aab35aSMartin Matuska 	for (uint_t i = 0; i < writes; i++) {
405617aab35aSMartin Matuska 		uint_t n = b + (i < bb);
405717aab35aSMartin Matuska 		abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
405817aab35aSMartin Matuska 		rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
405917aab35aSMartin Matuska 		    vd->vdev_child[(blkid + i) % vd->vdev_children],
406017aab35aSMartin Matuska 		    ((blkid + i) / vd->vdev_children) << ashift,
406117aab35aSMartin Matuska 		    abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
406217aab35aSMartin Matuska 		    ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
406317aab35aSMartin Matuska 	}
406417aab35aSMartin Matuska 
406517aab35aSMartin Matuska 	/*
406617aab35aSMartin Matuska 	 * Allocate and issue ZIO for each child we read.  For reads of only
406717aab35aSMartin Matuska 	 * one block we can use respective writer ABDs, since they will also
406817aab35aSMartin Matuska 	 * have only one block.  For bigger reads create gang ABDs and fill
406917aab35aSMartin Matuska 	 * them with respective blocks from writer ABDs.
407017aab35aSMartin Matuska 	 */
407117aab35aSMartin Matuska 	b = blocks / old_children;
407217aab35aSMartin Matuska 	bb = blocks % old_children;
407317aab35aSMartin Matuska 	for (uint_t i = 0; i < reads; i++) {
407417aab35aSMartin Matuska 		uint_t n = b + (i < bb);
407517aab35aSMartin Matuska 		abd_t *abd;
407617aab35aSMartin Matuska 		if (n > 1) {
407717aab35aSMartin Matuska 			abd = abd_alloc_gang();
407817aab35aSMartin Matuska 			for (uint_t j = 0; j < n; j++) {
407917aab35aSMartin Matuska 				uint_t b = j * old_children + i;
408017aab35aSMartin Matuska 				abd_t *cabd = abd_get_offset_size(
408117aab35aSMartin Matuska 				    rra->rra_zio[b % vd->vdev_children]->io_abd,
408217aab35aSMartin Matuska 				    (b / vd->vdev_children) << ashift,
408317aab35aSMartin Matuska 				    1 << ashift);
408417aab35aSMartin Matuska 				abd_gang_add(abd, cabd, B_TRUE);
408517aab35aSMartin Matuska 			}
408617aab35aSMartin Matuska 		} else {
408717aab35aSMartin Matuska 			abd = rra->rra_zio[i]->io_abd;
408817aab35aSMartin Matuska 		}
408917aab35aSMartin Matuska 		zio_nowait(zio_vdev_child_io(pio, NULL,
409017aab35aSMartin Matuska 		    vd->vdev_child[(blkid + i) % old_children],
409117aab35aSMartin Matuska 		    ((blkid + i) / old_children) << ashift, abd,
409217aab35aSMartin Matuska 		    n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
409317aab35aSMartin Matuska 		    ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
409417aab35aSMartin Matuska 	}
4095e716630dSMartin Matuska 
4096e716630dSMartin Matuska 	return (B_FALSE);
4097e716630dSMartin Matuska }
4098e716630dSMartin Matuska 
4099e716630dSMartin Matuska /*
4100e716630dSMartin Matuska  * For testing (ztest specific)
4101e716630dSMartin Matuska  */
4102e716630dSMartin Matuska static void
4103e716630dSMartin Matuska raidz_expand_pause(uint_t pause_point)
4104e716630dSMartin Matuska {
4105e716630dSMartin Matuska 	while (raidz_expand_pause_point != 0 &&
4106e716630dSMartin Matuska 	    raidz_expand_pause_point <= pause_point)
4107e716630dSMartin Matuska 		delay(hz);
4108e716630dSMartin Matuska }
4109e716630dSMartin Matuska 
4110e716630dSMartin Matuska static void
4111e716630dSMartin Matuska raidz_scratch_child_done(zio_t *zio)
4112e716630dSMartin Matuska {
4113e716630dSMartin Matuska 	zio_t *pio = zio->io_private;
4114e716630dSMartin Matuska 
4115e716630dSMartin Matuska 	mutex_enter(&pio->io_lock);
4116e716630dSMartin Matuska 	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4117e716630dSMartin Matuska 	mutex_exit(&pio->io_lock);
4118e716630dSMartin Matuska }
4119e716630dSMartin Matuska 
4120e716630dSMartin Matuska /*
4121e716630dSMartin Matuska  * Reflow the beginning portion of the vdev into an intermediate scratch area
4122e716630dSMartin Matuska  * in memory and on disk. This operation must be persisted on disk before we
4123e716630dSMartin Matuska  * proceed to overwrite the beginning portion with the reflowed data.
4124e716630dSMartin Matuska  *
4125e716630dSMartin Matuska  * This multi-step task can fail to complete if disk errors are encountered
4126e716630dSMartin Matuska  * and we can return here after a pause (waiting for disk to become healthy).
4127e716630dSMartin Matuska  */
4128e716630dSMartin Matuska static void
4129e716630dSMartin Matuska raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4130e716630dSMartin Matuska {
4131e716630dSMartin Matuska 	vdev_raidz_expand_t *vre = arg;
4132e716630dSMartin Matuska 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4133e716630dSMartin Matuska 	zio_t *pio;
4134e716630dSMartin Matuska 	int error;
4135e716630dSMartin Matuska 
4136e716630dSMartin Matuska 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4137e716630dSMartin Matuska 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4138e716630dSMartin Matuska 	int ashift = raidvd->vdev_ashift;
4139aca928a5SMartin Matuska 	uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4140aca928a5SMartin Matuska 	    uint64_t);
4141e716630dSMartin Matuska 	uint64_t logical_size = write_size * raidvd->vdev_children;
4142e716630dSMartin Matuska 	uint64_t read_size =
4143e716630dSMartin Matuska 	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4144e716630dSMartin Matuska 	    1 << ashift);
4145e716630dSMartin Matuska 
4146e716630dSMartin Matuska 	/*
4147e716630dSMartin Matuska 	 * The scratch space must be large enough to get us to the point
4148e716630dSMartin Matuska 	 * that one row does not overlap itself when moved.  This is checked
4149e716630dSMartin Matuska 	 * by vdev_raidz_attach_check().
4150e716630dSMartin Matuska 	 */
4151e716630dSMartin Matuska 	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4152e716630dSMartin Matuska 	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4153e716630dSMartin Matuska 	VERIFY3U(write_size, <=, read_size);
4154e716630dSMartin Matuska 
4155e716630dSMartin Matuska 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4156e716630dSMartin Matuska 	    0, logical_size, RL_WRITER);
4157e716630dSMartin Matuska 
4158e716630dSMartin Matuska 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4159e716630dSMartin Matuska 	    KM_SLEEP);
4160e716630dSMartin Matuska 	for (int i = 0; i < raidvd->vdev_children; i++) {
4161e716630dSMartin Matuska 		abds[i] = abd_alloc_linear(read_size, B_FALSE);
4162e716630dSMartin Matuska 	}
4163e716630dSMartin Matuska 
4164e716630dSMartin Matuska 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4165e716630dSMartin Matuska 
4166e716630dSMartin Matuska 	/*
4167e716630dSMartin Matuska 	 * If we have already written the scratch area then we must read from
4168e716630dSMartin Matuska 	 * there, since new writes were redirected there while we were paused
4169e716630dSMartin Matuska 	 * or the original location may have been partially overwritten with
4170e716630dSMartin Matuska 	 * reflowed data.
4171e716630dSMartin Matuska 	 */
4172e716630dSMartin Matuska 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4173e716630dSMartin Matuska 		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4174e716630dSMartin Matuska 		/*
4175e716630dSMartin Matuska 		 * Read from scratch space.
4176e716630dSMartin Matuska 		 */
4177e716630dSMartin Matuska 		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4178e716630dSMartin Matuska 		for (int i = 0; i < raidvd->vdev_children; i++) {
4179e716630dSMartin Matuska 			/*
4180e716630dSMartin Matuska 			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4181e716630dSMartin Matuska 			 * to the offset to calculate the physical offset to
4182e716630dSMartin Matuska 			 * write to.  Passing in a negative offset makes us
4183e716630dSMartin Matuska 			 * access the scratch area.
4184e716630dSMartin Matuska 			 */
4185e716630dSMartin Matuska 			zio_nowait(zio_vdev_child_io(pio, NULL,
4186e716630dSMartin Matuska 			    raidvd->vdev_child[i],
4187e716630dSMartin Matuska 			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
418817aab35aSMartin Matuska 			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
4189e716630dSMartin Matuska 			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4190e716630dSMartin Matuska 		}
4191e716630dSMartin Matuska 		error = zio_wait(pio);
4192e716630dSMartin Matuska 		if (error != 0) {
4193e716630dSMartin Matuska 			zfs_dbgmsg("reflow: error %d reading scratch location",
4194e716630dSMartin Matuska 			    error);
4195e716630dSMartin Matuska 			goto io_error_exit;
4196e716630dSMartin Matuska 		}
4197e716630dSMartin Matuska 		goto overwrite;
4198e716630dSMartin Matuska 	}
4199e716630dSMartin Matuska 
4200e716630dSMartin Matuska 	/*
4201e716630dSMartin Matuska 	 * Read from original location.
4202e716630dSMartin Matuska 	 */
4203e716630dSMartin Matuska 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4204e716630dSMartin Matuska 	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4205e716630dSMartin Matuska 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4206e716630dSMartin Matuska 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4207e716630dSMartin Matuska 		    0, abds[i], read_size, ZIO_TYPE_READ,
420817aab35aSMartin Matuska 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4209e716630dSMartin Matuska 		    raidz_scratch_child_done, pio));
4210e716630dSMartin Matuska 	}
4211e716630dSMartin Matuska 	error = zio_wait(pio);
4212e716630dSMartin Matuska 	if (error != 0) {
4213e716630dSMartin Matuska 		zfs_dbgmsg("reflow: error %d reading original location", error);
4214e716630dSMartin Matuska io_error_exit:
4215e716630dSMartin Matuska 		for (int i = 0; i < raidvd->vdev_children; i++)
4216e716630dSMartin Matuska 			abd_free(abds[i]);
4217e716630dSMartin Matuska 		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4218e716630dSMartin Matuska 		zfs_rangelock_exit(lr);
4219e716630dSMartin Matuska 		spa_config_exit(spa, SCL_STATE, FTAG);
4220e716630dSMartin Matuska 		return;
4221e716630dSMartin Matuska 	}
4222e716630dSMartin Matuska 
4223e716630dSMartin Matuska 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4224e716630dSMartin Matuska 
4225e716630dSMartin Matuska 	/*
4226e716630dSMartin Matuska 	 * Reflow in memory.
4227e716630dSMartin Matuska 	 */
4228e716630dSMartin Matuska 	uint64_t logical_sectors = logical_size >> ashift;
4229e716630dSMartin Matuska 	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4230e716630dSMartin Matuska 		int oldchild = i % (raidvd->vdev_children - 1);
4231e716630dSMartin Matuska 		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4232e716630dSMartin Matuska 
4233e716630dSMartin Matuska 		int newchild = i % raidvd->vdev_children;
4234e716630dSMartin Matuska 		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4235e716630dSMartin Matuska 
4236e716630dSMartin Matuska 		/* a single sector should not be copying over itself */
4237e716630dSMartin Matuska 		ASSERT(!(newchild == oldchild && newoff == oldoff));
4238e716630dSMartin Matuska 
4239e716630dSMartin Matuska 		abd_copy_off(abds[newchild], abds[oldchild],
4240e716630dSMartin Matuska 		    newoff, oldoff, 1 << ashift);
4241e716630dSMartin Matuska 	}
4242e716630dSMartin Matuska 
4243e716630dSMartin Matuska 	/*
4244e716630dSMartin Matuska 	 * Verify that we filled in everything we intended to (write_size on
4245e716630dSMartin Matuska 	 * each child).
4246e716630dSMartin Matuska 	 */
4247e716630dSMartin Matuska 	VERIFY0(logical_sectors % raidvd->vdev_children);
4248e716630dSMartin Matuska 	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4249e716630dSMartin Matuska 	    write_size);
4250e716630dSMartin Matuska 
4251e716630dSMartin Matuska 	/*
4252e716630dSMartin Matuska 	 * Write to scratch location (boot area).
4253e716630dSMartin Matuska 	 */
4254e716630dSMartin Matuska 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4255e716630dSMartin Matuska 	for (int i = 0; i < raidvd->vdev_children; i++) {
4256e716630dSMartin Matuska 		/*
4257e716630dSMartin Matuska 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4258e716630dSMartin Matuska 		 * the offset to calculate the physical offset to write to.
4259e716630dSMartin Matuska 		 * Passing in a negative offset lets us access the boot area.
4260e716630dSMartin Matuska 		 */
4261e716630dSMartin Matuska 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4262e716630dSMartin Matuska 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
426317aab35aSMartin Matuska 		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
4264e716630dSMartin Matuska 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4265e716630dSMartin Matuska 	}
4266e716630dSMartin Matuska 	error = zio_wait(pio);
4267e716630dSMartin Matuska 	if (error != 0) {
4268e716630dSMartin Matuska 		zfs_dbgmsg("reflow: error %d writing scratch location", error);
4269e716630dSMartin Matuska 		goto io_error_exit;
4270e716630dSMartin Matuska 	}
4271e716630dSMartin Matuska 	pio = zio_root(spa, NULL, NULL, 0);
4272e716630dSMartin Matuska 	zio_flush(pio, raidvd);
4273e716630dSMartin Matuska 	zio_wait(pio);
4274e716630dSMartin Matuska 
4275e716630dSMartin Matuska 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4276e716630dSMartin Matuska 	    (long long)logical_size);
4277e716630dSMartin Matuska 
4278e716630dSMartin Matuska 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4279e716630dSMartin Matuska 
4280e716630dSMartin Matuska 	/*
4281e716630dSMartin Matuska 	 * Update uberblock to indicate that scratch space is valid.  This is
4282e716630dSMartin Matuska 	 * needed because after this point, the real location may be
4283e716630dSMartin Matuska 	 * overwritten.  If we crash, we need to get the data from the
4284e716630dSMartin Matuska 	 * scratch space, rather than the real location.
4285e716630dSMartin Matuska 	 *
4286e716630dSMartin Matuska 	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4287e716630dSMartin Matuska 	 * will prefer this uberblock.
4288e716630dSMartin Matuska 	 */
4289e716630dSMartin Matuska 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4290e716630dSMartin Matuska 	spa->spa_ubsync.ub_timestamp++;
4291e716630dSMartin Matuska 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4292e716630dSMartin Matuska 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4293e716630dSMartin Matuska 	if (spa_multihost(spa))
4294e716630dSMartin Matuska 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4295e716630dSMartin Matuska 
4296e716630dSMartin Matuska 	zfs_dbgmsg("reflow: uberblock updated "
4297e716630dSMartin Matuska 	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4298e716630dSMartin Matuska 	    (long long)spa->spa_ubsync.ub_txg,
4299e716630dSMartin Matuska 	    (long long)logical_size,
4300e716630dSMartin Matuska 	    (long long)spa->spa_ubsync.ub_timestamp);
4301e716630dSMartin Matuska 
4302e716630dSMartin Matuska 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4303e716630dSMartin Matuska 
4304e716630dSMartin Matuska 	/*
4305e716630dSMartin Matuska 	 * Overwrite with reflow'ed data.
4306e716630dSMartin Matuska 	 */
4307e716630dSMartin Matuska overwrite:
4308e716630dSMartin Matuska 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4309e716630dSMartin Matuska 	for (int i = 0; i < raidvd->vdev_children; i++) {
4310e716630dSMartin Matuska 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4311e716630dSMartin Matuska 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
431217aab35aSMartin Matuska 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
4313e716630dSMartin Matuska 		    raidz_scratch_child_done, pio));
4314e716630dSMartin Matuska 	}
4315e716630dSMartin Matuska 	error = zio_wait(pio);
4316e716630dSMartin Matuska 	if (error != 0) {
4317e716630dSMartin Matuska 		/*
4318e716630dSMartin Matuska 		 * When we exit early here and drop the range lock, new
4319e716630dSMartin Matuska 		 * writes will go into the scratch area so we'll need to
4320e716630dSMartin Matuska 		 * read from there when we return after pausing.
4321e716630dSMartin Matuska 		 */
4322e716630dSMartin Matuska 		zfs_dbgmsg("reflow: error %d writing real location", error);
4323e716630dSMartin Matuska 		/*
4324e716630dSMartin Matuska 		 * Update the uberblock that is written when this txg completes.
4325e716630dSMartin Matuska 		 */
4326e716630dSMartin Matuska 		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4327e716630dSMartin Matuska 		    logical_size);
4328e716630dSMartin Matuska 		goto io_error_exit;
4329e716630dSMartin Matuska 	}
4330e716630dSMartin Matuska 	pio = zio_root(spa, NULL, NULL, 0);
4331e716630dSMartin Matuska 	zio_flush(pio, raidvd);
4332e716630dSMartin Matuska 	zio_wait(pio);
4333e716630dSMartin Matuska 
4334e716630dSMartin Matuska 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4335e716630dSMartin Matuska 	    (long long)logical_size);
4336e716630dSMartin Matuska 	for (int i = 0; i < raidvd->vdev_children; i++)
4337e716630dSMartin Matuska 		abd_free(abds[i]);
4338e716630dSMartin Matuska 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4339e716630dSMartin Matuska 
4340e716630dSMartin Matuska 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4341e716630dSMartin Matuska 
4342e716630dSMartin Matuska 	/*
4343e716630dSMartin Matuska 	 * Update uberblock to indicate that the initial part has been
4344e716630dSMartin Matuska 	 * reflow'ed.  This is needed because after this point (when we exit
4345e716630dSMartin Matuska 	 * the rangelock), we allow regular writes to this region, which will
4346e716630dSMartin Matuska 	 * be written to the new location only (because reflow_offset_next ==
4347e716630dSMartin Matuska 	 * reflow_offset_synced).  If we crashed and re-copied from the
4348e716630dSMartin Matuska 	 * scratch space, we would lose the regular writes.
4349e716630dSMartin Matuska 	 */
4350e716630dSMartin Matuska 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4351e716630dSMartin Matuska 	    logical_size);
4352e716630dSMartin Matuska 	spa->spa_ubsync.ub_timestamp++;
4353e716630dSMartin Matuska 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4354e716630dSMartin Matuska 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4355e716630dSMartin Matuska 	if (spa_multihost(spa))
4356e716630dSMartin Matuska 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4357e716630dSMartin Matuska 
4358e716630dSMartin Matuska 	zfs_dbgmsg("reflow: uberblock updated "
4359e716630dSMartin Matuska 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4360e716630dSMartin Matuska 	    (long long)spa->spa_ubsync.ub_txg,
4361e716630dSMartin Matuska 	    (long long)logical_size,
4362e716630dSMartin Matuska 	    (long long)spa->spa_ubsync.ub_timestamp);
4363e716630dSMartin Matuska 
4364e716630dSMartin Matuska 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4365e716630dSMartin Matuska 
4366e716630dSMartin Matuska 	/*
4367e716630dSMartin Matuska 	 * Update progress.
4368e716630dSMartin Matuska 	 */
4369e716630dSMartin Matuska 	vre->vre_offset = logical_size;
4370e716630dSMartin Matuska 	zfs_rangelock_exit(lr);
4371e716630dSMartin Matuska 	spa_config_exit(spa, SCL_STATE, FTAG);
4372e716630dSMartin Matuska 
4373e716630dSMartin Matuska 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4374e716630dSMartin Matuska 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4375e716630dSMartin Matuska 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4376e716630dSMartin Matuska 	/*
4377e716630dSMartin Matuska 	 * Note - raidz_reflow_sync() will update the uberblock state to
4378e716630dSMartin Matuska 	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4379e716630dSMartin Matuska 	 */
4380e716630dSMartin Matuska 	raidz_reflow_sync(spa, tx);
4381e716630dSMartin Matuska 
4382e716630dSMartin Matuska 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4383e716630dSMartin Matuska }
4384e716630dSMartin Matuska 
4385e716630dSMartin Matuska /*
4386e716630dSMartin Matuska  * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4387e716630dSMartin Matuska  * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
4388e716630dSMartin Matuska  */
4389e716630dSMartin Matuska void
4390e716630dSMartin Matuska vdev_raidz_reflow_copy_scratch(spa_t *spa)
4391e716630dSMartin Matuska {
4392e716630dSMartin Matuska 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4393e716630dSMartin Matuska 	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4394e716630dSMartin Matuska 	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4395e716630dSMartin Matuska 
4396e716630dSMartin Matuska 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4397e716630dSMartin Matuska 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4398e716630dSMartin Matuska 	ASSERT0(logical_size % raidvd->vdev_children);
4399e716630dSMartin Matuska 	uint64_t write_size = logical_size / raidvd->vdev_children;
4400e716630dSMartin Matuska 
4401e716630dSMartin Matuska 	zio_t *pio;
4402e716630dSMartin Matuska 
4403e716630dSMartin Matuska 	/*
4404e716630dSMartin Matuska 	 * Read from scratch space.
4405e716630dSMartin Matuska 	 */
4406e716630dSMartin Matuska 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4407e716630dSMartin Matuska 	    KM_SLEEP);
4408e716630dSMartin Matuska 	for (int i = 0; i < raidvd->vdev_children; i++) {
4409e716630dSMartin Matuska 		abds[i] = abd_alloc_linear(write_size, B_FALSE);
4410e716630dSMartin Matuska 	}
4411e716630dSMartin Matuska 
4412e716630dSMartin Matuska 	pio = zio_root(spa, NULL, NULL, 0);
4413e716630dSMartin Matuska 	for (int i = 0; i < raidvd->vdev_children; i++) {
4414e716630dSMartin Matuska 		/*
4415e716630dSMartin Matuska 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4416e716630dSMartin Matuska 		 * the offset to calculate the physical offset to write to.
4417e716630dSMartin Matuska 		 * Passing in a negative offset lets us access the boot area.
4418e716630dSMartin Matuska 		 */
4419e716630dSMartin Matuska 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4420e716630dSMartin Matuska 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
442117aab35aSMartin Matuska 		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
4422e716630dSMartin Matuska 		    raidz_scratch_child_done, pio));
4423e716630dSMartin Matuska 	}
4424e716630dSMartin Matuska 	zio_wait(pio);
4425e716630dSMartin Matuska 
4426e716630dSMartin Matuska 	/*
4427e716630dSMartin Matuska 	 * Overwrite real location with reflow'ed data.
4428e716630dSMartin Matuska 	 */
4429e716630dSMartin Matuska 	pio = zio_root(spa, NULL, NULL, 0);
4430e716630dSMartin Matuska 	for (int i = 0; i < raidvd->vdev_children; i++) {
4431e716630dSMartin Matuska 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4432e716630dSMartin Matuska 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
443317aab35aSMartin Matuska 		    ZIO_PRIORITY_REMOVAL, 0,
4434e716630dSMartin Matuska 		    raidz_scratch_child_done, pio));
4435e716630dSMartin Matuska 	}
4436e716630dSMartin Matuska 	zio_wait(pio);
4437e716630dSMartin Matuska 	pio = zio_root(spa, NULL, NULL, 0);
4438e716630dSMartin Matuska 	zio_flush(pio, raidvd);
4439e716630dSMartin Matuska 	zio_wait(pio);
4440e716630dSMartin Matuska 
4441e716630dSMartin Matuska 	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4442e716630dSMartin Matuska 	    "to real location", (long long)logical_size);
4443e716630dSMartin Matuska 
4444e716630dSMartin Matuska 	for (int i = 0; i < raidvd->vdev_children; i++)
4445e716630dSMartin Matuska 		abd_free(abds[i]);
4446e716630dSMartin Matuska 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4447e716630dSMartin Matuska 
4448e716630dSMartin Matuska 	/*
4449e716630dSMartin Matuska 	 * Update uberblock.
4450e716630dSMartin Matuska 	 */
4451e716630dSMartin Matuska 	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4452e716630dSMartin Matuska 	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4453e716630dSMartin Matuska 	spa->spa_ubsync.ub_timestamp++;
4454e716630dSMartin Matuska 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4455e716630dSMartin Matuska 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4456e716630dSMartin Matuska 	if (spa_multihost(spa))
4457e716630dSMartin Matuska 		mmp_update_uberblock(spa, &spa->spa_ubsync);
4458e716630dSMartin Matuska 
4459e716630dSMartin Matuska 	zfs_dbgmsg("reflow recovery: uberblock updated "
4460e716630dSMartin Matuska 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4461e716630dSMartin Matuska 	    (long long)spa->spa_ubsync.ub_txg,
4462e716630dSMartin Matuska 	    (long long)logical_size,
4463e716630dSMartin Matuska 	    (long long)spa->spa_ubsync.ub_timestamp);
4464e716630dSMartin Matuska 
4465e716630dSMartin Matuska 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4466e716630dSMartin Matuska 	    spa_first_txg(spa));
4467e716630dSMartin Matuska 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4468e716630dSMartin Matuska 	vre->vre_offset = logical_size;
4469e716630dSMartin Matuska 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4470e716630dSMartin Matuska 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4471e716630dSMartin Matuska 	/*
4472e716630dSMartin Matuska 	 * Note that raidz_reflow_sync() will update the uberblock once more
4473e716630dSMartin Matuska 	 */
4474e716630dSMartin Matuska 	raidz_reflow_sync(spa, tx);
4475e716630dSMartin Matuska 
4476e716630dSMartin Matuska 	dmu_tx_commit(tx);
4477e716630dSMartin Matuska 
4478e716630dSMartin Matuska 	spa_config_exit(spa, SCL_STATE, FTAG);
4479e716630dSMartin Matuska }
4480e716630dSMartin Matuska 
4481e716630dSMartin Matuska static boolean_t
4482e716630dSMartin Matuska spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4483e716630dSMartin Matuska {
4484e716630dSMartin Matuska 	(void) zthr;
4485e716630dSMartin Matuska 	spa_t *spa = arg;
4486e716630dSMartin Matuska 
4487e716630dSMartin Matuska 	return (spa->spa_raidz_expand != NULL &&
4488e716630dSMartin Matuska 	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
4489e716630dSMartin Matuska }
4490e716630dSMartin Matuska 
4491e716630dSMartin Matuska /*
4492e716630dSMartin Matuska  * RAIDZ expansion background thread
4493e716630dSMartin Matuska  *
4494e716630dSMartin Matuska  * Can be called multiple times if the reflow is paused
4495e716630dSMartin Matuska  */
4496e716630dSMartin Matuska static void
4497e716630dSMartin Matuska spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4498e716630dSMartin Matuska {
4499e716630dSMartin Matuska 	spa_t *spa = arg;
4500e716630dSMartin Matuska 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4501e716630dSMartin Matuska 
4502e716630dSMartin Matuska 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4503e716630dSMartin Matuska 		vre->vre_offset = 0;
4504e716630dSMartin Matuska 	else
4505e716630dSMartin Matuska 		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4506e716630dSMartin Matuska 
	/* Reflow the beginning portion using the scratch area */
4508e716630dSMartin Matuska 	if (vre->vre_offset == 0) {
4509e716630dSMartin Matuska 		VERIFY0(dsl_sync_task(spa_name(spa),
4510e716630dSMartin Matuska 		    NULL, raidz_reflow_scratch_sync,
4511e716630dSMartin Matuska 		    vre, 0, ZFS_SPACE_CHECK_NONE));
4512e716630dSMartin Matuska 
4513e716630dSMartin Matuska 		/* if we encountered errors then pause */
4514e716630dSMartin Matuska 		if (vre->vre_offset == 0) {
4515e716630dSMartin Matuska 			mutex_enter(&vre->vre_lock);
4516e716630dSMartin Matuska 			vre->vre_waiting_for_resilver = B_TRUE;
4517e716630dSMartin Matuska 			mutex_exit(&vre->vre_lock);
4518e716630dSMartin Matuska 			return;
4519e716630dSMartin Matuska 		}
4520e716630dSMartin Matuska 	}
4521e716630dSMartin Matuska 
4522e716630dSMartin Matuska 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4523e716630dSMartin Matuska 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4524e716630dSMartin Matuska 
4525e716630dSMartin Matuska 	uint64_t guid = raidvd->vdev_guid;
4526e716630dSMartin Matuska 
4527e716630dSMartin Matuska 	/* Iterate over all the remaining metaslabs */
4528e716630dSMartin Matuska 	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4529e716630dSMartin Matuska 	    i < raidvd->vdev_ms_count &&
4530e716630dSMartin Matuska 	    !zthr_iscancelled(zthr) &&
4531e716630dSMartin Matuska 	    vre->vre_failed_offset == UINT64_MAX; i++) {
4532e716630dSMartin Matuska 		metaslab_t *msp = raidvd->vdev_ms[i];
4533e716630dSMartin Matuska 
4534e716630dSMartin Matuska 		metaslab_disable(msp);
4535e716630dSMartin Matuska 		mutex_enter(&msp->ms_lock);
4536e716630dSMartin Matuska 
4537e716630dSMartin Matuska 		/*
4538e716630dSMartin Matuska 		 * The metaslab may be newly created (for the expanded
4539e716630dSMartin Matuska 		 * space), in which case its trees won't exist yet,
4540e716630dSMartin Matuska 		 * so we need to bail out early.
4541e716630dSMartin Matuska 		 */
4542e716630dSMartin Matuska 		if (msp->ms_new) {
4543e716630dSMartin Matuska 			mutex_exit(&msp->ms_lock);
4544e716630dSMartin Matuska 			metaslab_enable(msp, B_FALSE, B_FALSE);
4545e716630dSMartin Matuska 			continue;
4546e716630dSMartin Matuska 		}
4547e716630dSMartin Matuska 
4548e716630dSMartin Matuska 		VERIFY0(metaslab_load(msp));
4549e716630dSMartin Matuska 
4550e716630dSMartin Matuska 		/*
4551e716630dSMartin Matuska 		 * We want to copy everything except the free (allocatable)
4552e716630dSMartin Matuska 		 * space.  Note that there may be a little bit more free
4553e716630dSMartin Matuska 		 * space (e.g. in ms_defer), and it's fine to copy that too.
4554e716630dSMartin Matuska 		 */
455517aab35aSMartin Matuska 		uint64_t shift, start;
455617aab35aSMartin Matuska 		range_seg_type_t type = metaslab_calculate_range_tree_type(
455717aab35aSMartin Matuska 		    raidvd, msp, &start, &shift);
455817aab35aSMartin Matuska 		range_tree_t *rt = range_tree_create(NULL, type, NULL,
455917aab35aSMartin Matuska 		    start, shift);
4560e716630dSMartin Matuska 		range_tree_add(rt, msp->ms_start, msp->ms_size);
4561e716630dSMartin Matuska 		range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
4562e716630dSMartin Matuska 		mutex_exit(&msp->ms_lock);
4563e716630dSMartin Matuska 
4564e716630dSMartin Matuska 		/*
4565e716630dSMartin Matuska 		 * Force the last sector of each metaslab to be copied.  This
4566e716630dSMartin Matuska 		 * ensures that we advance the on-disk progress to the end of
4567e716630dSMartin Matuska 		 * this metaslab while the metaslab is disabled.  Otherwise, we
4568e716630dSMartin Matuska 		 * could move past this metaslab without advancing the on-disk
4569e716630dSMartin Matuska 		 * progress, and then an allocation to this metaslab would not
4570e716630dSMartin Matuska 		 * be copied.
4571e716630dSMartin Matuska 		 */
4572e716630dSMartin Matuska 		int sectorsz = 1 << raidvd->vdev_ashift;
4573e716630dSMartin Matuska 		uint64_t ms_last_offset = msp->ms_start +
4574e716630dSMartin Matuska 		    msp->ms_size - sectorsz;
4575e716630dSMartin Matuska 		if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
4576e716630dSMartin Matuska 			range_tree_add(rt, ms_last_offset, sectorsz);
4577e716630dSMartin Matuska 		}
4578e716630dSMartin Matuska 
4579e716630dSMartin Matuska 		/*
4580e716630dSMartin Matuska 		 * When we are resuming from a paused expansion (i.e.
4581e716630dSMartin Matuska 		 * when importing a pool with an expansion in progress),
4582e716630dSMartin Matuska 		 * discard any state that we have already processed.
4583e716630dSMartin Matuska 		 */
458417aab35aSMartin Matuska 		if (vre->vre_offset > msp->ms_start) {
458517aab35aSMartin Matuska 			range_tree_clear(rt, msp->ms_start,
458617aab35aSMartin Matuska 			    vre->vre_offset - msp->ms_start);
458717aab35aSMartin Matuska 		}
4588e716630dSMartin Matuska 
4589e716630dSMartin Matuska 		while (!zthr_iscancelled(zthr) &&
4590e716630dSMartin Matuska 		    !range_tree_is_empty(rt) &&
4591e716630dSMartin Matuska 		    vre->vre_failed_offset == UINT64_MAX) {
4592e716630dSMartin Matuska 
4593e716630dSMartin Matuska 			/*
4594e716630dSMartin Matuska 			 * We need to periodically drop the config lock so that
4595e716630dSMartin Matuska 			 * writers can get in.  Additionally, we can't wait
4596e716630dSMartin Matuska 			 * for a txg to sync while holding a config lock
4597e716630dSMartin Matuska 			 * (since a waiting writer could cause a 3-way deadlock
4598e716630dSMartin Matuska 			 * with the sync thread, which also gets a config
4599e716630dSMartin Matuska 			 * lock for reader).  So we can't hold the config lock
4600e716630dSMartin Matuska 			 * while calling dmu_tx_assign().
4601e716630dSMartin Matuska 			 */
4602e716630dSMartin Matuska 			spa_config_exit(spa, SCL_CONFIG, FTAG);
4603e716630dSMartin Matuska 
4604e716630dSMartin Matuska 			/*
4605e716630dSMartin Matuska 			 * If requested, pause the reflow when the amount
4606e716630dSMartin Matuska 			 * specified by raidz_expand_max_reflow_bytes is reached
4607e716630dSMartin Matuska 			 *
4608e716630dSMartin Matuska 			 * This pause is only used during testing or debugging.
4609e716630dSMartin Matuska 			 */
4610e716630dSMartin Matuska 			while (raidz_expand_max_reflow_bytes != 0 &&
4611e716630dSMartin Matuska 			    raidz_expand_max_reflow_bytes <=
4612e716630dSMartin Matuska 			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4613e716630dSMartin Matuska 				delay(hz);
4614e716630dSMartin Matuska 			}
4615e716630dSMartin Matuska 
4616e716630dSMartin Matuska 			mutex_enter(&vre->vre_lock);
4617e716630dSMartin Matuska 			while (vre->vre_outstanding_bytes >
4618e716630dSMartin Matuska 			    raidz_expand_max_copy_bytes) {
4619e716630dSMartin Matuska 				cv_wait(&vre->vre_cv, &vre->vre_lock);
4620e716630dSMartin Matuska 			}
4621e716630dSMartin Matuska 			mutex_exit(&vre->vre_lock);
4622e716630dSMartin Matuska 
4623e716630dSMartin Matuska 			dmu_tx_t *tx =
4624e716630dSMartin Matuska 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4625e716630dSMartin Matuska 
4626e716630dSMartin Matuska 			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
4627e716630dSMartin Matuska 			uint64_t txg = dmu_tx_get_txg(tx);
4628e716630dSMartin Matuska 
4629e716630dSMartin Matuska 			/*
4630e716630dSMartin Matuska 			 * Reacquire the vdev_config lock.  Theoretically, the
4631e716630dSMartin Matuska 			 * vdev_t that we're expanding may have changed.
4632e716630dSMartin Matuska 			 */
4633e716630dSMartin Matuska 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4634e716630dSMartin Matuska 			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4635e716630dSMartin Matuska 
4636e716630dSMartin Matuska 			boolean_t needsync =
4637e716630dSMartin Matuska 			    raidz_reflow_impl(raidvd, vre, rt, tx);
4638e716630dSMartin Matuska 
4639e716630dSMartin Matuska 			dmu_tx_commit(tx);
4640e716630dSMartin Matuska 
4641e716630dSMartin Matuska 			if (needsync) {
4642e716630dSMartin Matuska 				spa_config_exit(spa, SCL_CONFIG, FTAG);
4643e716630dSMartin Matuska 				txg_wait_synced(spa->spa_dsl_pool, txg);
4644e716630dSMartin Matuska 				spa_config_enter(spa, SCL_CONFIG, FTAG,
4645e716630dSMartin Matuska 				    RW_READER);
4646e716630dSMartin Matuska 			}
4647e716630dSMartin Matuska 		}
4648e716630dSMartin Matuska 
4649e716630dSMartin Matuska 		spa_config_exit(spa, SCL_CONFIG, FTAG);
4650e716630dSMartin Matuska 
4651e716630dSMartin Matuska 		metaslab_enable(msp, B_FALSE, B_FALSE);
4652e716630dSMartin Matuska 		range_tree_vacate(rt, NULL, NULL);
4653e716630dSMartin Matuska 		range_tree_destroy(rt);
4654e716630dSMartin Matuska 
4655e716630dSMartin Matuska 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4656e716630dSMartin Matuska 		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4657e716630dSMartin Matuska 	}
4658e716630dSMartin Matuska 
4659e716630dSMartin Matuska 	spa_config_exit(spa, SCL_CONFIG, FTAG);
4660e716630dSMartin Matuska 
4661e716630dSMartin Matuska 	/*
4662e716630dSMartin Matuska 	 * The txg_wait_synced() here ensures that all reflow zio's have
4663e716630dSMartin Matuska 	 * completed, and vre_failed_offset has been set if necessary.  It
4664e716630dSMartin Matuska 	 * also ensures that the progress of the last raidz_reflow_sync() is
4665e716630dSMartin Matuska 	 * written to disk before raidz_reflow_complete_sync() changes the
4666e716630dSMartin Matuska 	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
4667e716630dSMartin Matuska 	 * determine if a reflow is in progress, in which case we may need to
4668e716630dSMartin Matuska 	 * write to both old and new locations.  Therefore we can only change
4669e716630dSMartin Matuska 	 * vre_state once this is not necessary, which is once the on-disk
4670e716630dSMartin Matuska 	 * progress (in spa_ubsync) has been set past any possible writes (to
4671e716630dSMartin Matuska 	 * the end of the last metaslab).
4672e716630dSMartin Matuska 	 */
4673e716630dSMartin Matuska 	txg_wait_synced(spa->spa_dsl_pool, 0);
4674e716630dSMartin Matuska 
4675e716630dSMartin Matuska 	if (!zthr_iscancelled(zthr) &&
4676e716630dSMartin Matuska 	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
4677e716630dSMartin Matuska 		/*
4678e716630dSMartin Matuska 		 * We are not being canceled or paused, so the reflow must be
4679e716630dSMartin Matuska 		 * complete. In that case also mark it as completed on disk.
4680e716630dSMartin Matuska 		 */
4681e716630dSMartin Matuska 		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
4682e716630dSMartin Matuska 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
4683e716630dSMartin Matuska 		    raidz_reflow_complete_sync, spa,
4684e716630dSMartin Matuska 		    0, ZFS_SPACE_CHECK_NONE));
4685e716630dSMartin Matuska 		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
4686e716630dSMartin Matuska 	} else {
4687e716630dSMartin Matuska 		/*
4688e716630dSMartin Matuska 		 * Wait for all copy zio's to complete and for all the
4689e716630dSMartin Matuska 		 * raidz_reflow_sync() synctasks to be run.
4690e716630dSMartin Matuska 		 */
4691e716630dSMartin Matuska 		spa_history_log_internal(spa, "reflow pause",
4692e716630dSMartin Matuska 		    NULL, "offset=%llu failed_offset=%lld",
4693e716630dSMartin Matuska 		    (long long)vre->vre_offset,
4694e716630dSMartin Matuska 		    (long long)vre->vre_failed_offset);
4695e716630dSMartin Matuska 		mutex_enter(&vre->vre_lock);
4696e716630dSMartin Matuska 		if (vre->vre_failed_offset != UINT64_MAX) {
4697e716630dSMartin Matuska 			/*
4698e716630dSMartin Matuska 			 * Reset progress so that we will retry everything
4699e716630dSMartin Matuska 			 * after the point that something failed.
4700e716630dSMartin Matuska 			 */
4701e716630dSMartin Matuska 			vre->vre_offset = vre->vre_failed_offset;
4702e716630dSMartin Matuska 			vre->vre_failed_offset = UINT64_MAX;
4703e716630dSMartin Matuska 			vre->vre_waiting_for_resilver = B_TRUE;
4704e716630dSMartin Matuska 		}
4705e716630dSMartin Matuska 		mutex_exit(&vre->vre_lock);
4706e716630dSMartin Matuska 	}
4707e716630dSMartin Matuska }
4708e716630dSMartin Matuska 
4709e716630dSMartin Matuska void
4710e716630dSMartin Matuska spa_start_raidz_expansion_thread(spa_t *spa)
4711e716630dSMartin Matuska {
4712e716630dSMartin Matuska 	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4713e716630dSMartin Matuska 	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4714e716630dSMartin Matuska 	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4715e716630dSMartin Matuska 	    spa, defclsyspri);
4716e716630dSMartin Matuska }
4717e716630dSMartin Matuska 
4718e716630dSMartin Matuska void
4719e716630dSMartin Matuska raidz_dtl_reassessed(vdev_t *vd)
4720e716630dSMartin Matuska {
4721e716630dSMartin Matuska 	spa_t *spa = vd->vdev_spa;
4722e716630dSMartin Matuska 	if (spa->spa_raidz_expand != NULL) {
4723e716630dSMartin Matuska 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4724e716630dSMartin Matuska 		/*
4725e716630dSMartin Matuska 		 * we get called often from vdev_dtl_reassess() so make
4726e716630dSMartin Matuska 		 * sure it's our vdev and any replacing is complete
4727e716630dSMartin Matuska 		 */
4728e716630dSMartin Matuska 		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4729e716630dSMartin Matuska 		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4730e716630dSMartin Matuska 			mutex_enter(&vre->vre_lock);
4731e716630dSMartin Matuska 			if (vre->vre_waiting_for_resilver) {
4732e716630dSMartin Matuska 				vdev_dbgmsg(vd, "DTL reassessed, "
4733e716630dSMartin Matuska 				    "continuing raidz expansion");
4734e716630dSMartin Matuska 				vre->vre_waiting_for_resilver = B_FALSE;
4735e716630dSMartin Matuska 				zthr_wakeup(spa->spa_raidz_expand_zthr);
4736e716630dSMartin Matuska 			}
4737e716630dSMartin Matuska 			mutex_exit(&vre->vre_lock);
4738e716630dSMartin Matuska 		}
4739e716630dSMartin Matuska 	}
4740e716630dSMartin Matuska }
4741e716630dSMartin Matuska 
4742e716630dSMartin Matuska int
4743e716630dSMartin Matuska vdev_raidz_attach_check(vdev_t *new_child)
4744e716630dSMartin Matuska {
4745e716630dSMartin Matuska 	vdev_t *raidvd = new_child->vdev_parent;
4746e716630dSMartin Matuska 	uint64_t new_children = raidvd->vdev_children;
4747e716630dSMartin Matuska 
4748e716630dSMartin Matuska 	/*
4749e716630dSMartin Matuska 	 * We use the "boot" space as scratch space to handle overwriting the
4750e716630dSMartin Matuska 	 * initial part of the vdev.  If it is too small, then this expansion
4751e716630dSMartin Matuska 	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
4752e716630dSMartin Matuska 	 * >200 children).
4753e716630dSMartin Matuska 	 */
4754e716630dSMartin Matuska 	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4755e716630dSMartin Matuska 		return (EINVAL);
4756e716630dSMartin Matuska 	}
4757e716630dSMartin Matuska 	return (0);
4758e716630dSMartin Matuska }
4759e716630dSMartin Matuska 
/*
 * Sync task that starts a RAIDZ expansion once a new child has been attached.
 *
 * arg is the newly attached child vdev, which must already be the last child
 * of its (top-level) RAIDZ parent.  tx is the transaction in which the
 * expansion state is recorded.
 */
void
vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
{
	vdev_t *new_child = arg;
	spa_t *spa = new_child->vdev_spa;
	vdev_t *raidvd = new_child->vdev_parent;
	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
	ASSERT3P(raidvd->vdev_top, ==, raidvd);
	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
	    new_child);

	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);

	/* The new child now counts towards the physical width. */
	vdrz->vd_physical_width++;

	/* No reflow info may already be recorded in the uberblock. */
	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);

	/*
	 * Begin at offset 0 with no failure recorded, publish this vdev's
	 * expansion state as the pool's active expansion, and wake the
	 * expansion thread.
	 */
	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
	vdrz->vn_vre.vre_offset = 0;
	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
	spa->spa_raidz_expand = &vdrz->vn_vre;
	zthr_wakeup(spa->spa_raidz_expand_zthr);

	/*
	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
	 * written to the config.
	 */
	vdev_config_dirty(raidvd);

	/* Reset the in-memory status for the new expansion. */
	vdrz->vn_vre.vre_start_time = gethrestime_sec();
	vdrz->vn_vre.vre_end_time = 0;
	vdrz->vn_vre.vre_state = DSS_SCANNING;
	vdrz->vn_vre.vre_bytes_copied = 0;

	/* Persist the state and start time in the top-level vdev's ZAP. */
	uint64_t state = vdrz->vn_vre.vre_state;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
	    sizeof (state), 1, &state, tx));

	uint64_t start_time = vdrz->vn_vre.vre_start_time;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
	    sizeof (start_time), 1, &start_time, tx));

	/*
	 * Drop the end time and bytes-copied entries.  The results are
	 * deliberately ignored -- presumably because the entries only exist
	 * if an earlier expansion wrote them (NOTE(review): confirm).
	 */
	(void) zap_remove(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
	(void) zap_remove(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);

	spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
	    "%s vdev %llu new width %llu", spa_name(spa),
	    (unsigned long long)raidvd->vdev_id,
	    (unsigned long long)raidvd->vdev_children);
}
4816e716630dSMartin Matuska 
4817e716630dSMartin Matuska int
4818e716630dSMartin Matuska vdev_raidz_load(vdev_t *vd)
4819e716630dSMartin Matuska {
4820e716630dSMartin Matuska 	vdev_raidz_t *vdrz = vd->vdev_tsd;
4821e716630dSMartin Matuska 	int err;
4822e716630dSMartin Matuska 
4823e716630dSMartin Matuska 	uint64_t state = DSS_NONE;
4824e716630dSMartin Matuska 	uint64_t start_time = 0;
4825e716630dSMartin Matuska 	uint64_t end_time = 0;
4826e716630dSMartin Matuska 	uint64_t bytes_copied = 0;
4827e716630dSMartin Matuska 
4828e716630dSMartin Matuska 	if (vd->vdev_top_zap != 0) {
4829e716630dSMartin Matuska 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4830e716630dSMartin Matuska 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4831e716630dSMartin Matuska 		    sizeof (state), 1, &state);
4832e716630dSMartin Matuska 		if (err != 0 && err != ENOENT)
4833e716630dSMartin Matuska 			return (err);
4834e716630dSMartin Matuska 
4835e716630dSMartin Matuska 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4836e716630dSMartin Matuska 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4837e716630dSMartin Matuska 		    sizeof (start_time), 1, &start_time);
4838e716630dSMartin Matuska 		if (err != 0 && err != ENOENT)
4839e716630dSMartin Matuska 			return (err);
4840e716630dSMartin Matuska 
4841e716630dSMartin Matuska 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4842e716630dSMartin Matuska 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4843e716630dSMartin Matuska 		    sizeof (end_time), 1, &end_time);
4844e716630dSMartin Matuska 		if (err != 0 && err != ENOENT)
4845e716630dSMartin Matuska 			return (err);
4846e716630dSMartin Matuska 
4847e716630dSMartin Matuska 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4848e716630dSMartin Matuska 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4849e716630dSMartin Matuska 		    sizeof (bytes_copied), 1, &bytes_copied);
4850e716630dSMartin Matuska 		if (err != 0 && err != ENOENT)
4851e716630dSMartin Matuska 			return (err);
4852e716630dSMartin Matuska 	}
4853e716630dSMartin Matuska 
4854e716630dSMartin Matuska 	/*
4855e716630dSMartin Matuska 	 * If we are in the middle of expansion, vre_state should have
4856e716630dSMartin Matuska 	 * already been set by vdev_raidz_init().
4857e716630dSMartin Matuska 	 */
4858e716630dSMartin Matuska 	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4859e716630dSMartin Matuska 	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4860e716630dSMartin Matuska 	vdrz->vn_vre.vre_start_time = start_time;
4861e716630dSMartin Matuska 	vdrz->vn_vre.vre_end_time = end_time;
4862e716630dSMartin Matuska 	vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4863e716630dSMartin Matuska 
4864e716630dSMartin Matuska 	return (0);
4865e716630dSMartin Matuska }
4866e716630dSMartin Matuska 
4867e716630dSMartin Matuska int
4868e716630dSMartin Matuska spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4869e716630dSMartin Matuska {
4870e716630dSMartin Matuska 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4871e716630dSMartin Matuska 
4872e716630dSMartin Matuska 	if (vre == NULL) {
4873e716630dSMartin Matuska 		/* no removal in progress; find most recent completed */
4874e716630dSMartin Matuska 		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4875e716630dSMartin Matuska 			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4876e716630dSMartin Matuska 			if (vd->vdev_ops == &vdev_raidz_ops) {
4877e716630dSMartin Matuska 				vdev_raidz_t *vdrz = vd->vdev_tsd;
4878e716630dSMartin Matuska 
4879e716630dSMartin Matuska 				if (vdrz->vn_vre.vre_end_time != 0 &&
4880e716630dSMartin Matuska 				    (vre == NULL ||
4881e716630dSMartin Matuska 				    vdrz->vn_vre.vre_end_time >
4882e716630dSMartin Matuska 				    vre->vre_end_time)) {
4883e716630dSMartin Matuska 					vre = &vdrz->vn_vre;
4884e716630dSMartin Matuska 				}
4885e716630dSMartin Matuska 			}
4886e716630dSMartin Matuska 		}
4887e716630dSMartin Matuska 	}
4888e716630dSMartin Matuska 
4889e716630dSMartin Matuska 	if (vre == NULL) {
4890e716630dSMartin Matuska 		return (SET_ERROR(ENOENT));
4891e716630dSMartin Matuska 	}
4892e716630dSMartin Matuska 
4893e716630dSMartin Matuska 	pres->pres_state = vre->vre_state;
4894e716630dSMartin Matuska 	pres->pres_expanding_vdev = vre->vre_vdev_id;
4895e716630dSMartin Matuska 
4896e716630dSMartin Matuska 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4897e716630dSMartin Matuska 	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4898e716630dSMartin Matuska 
4899e716630dSMartin Matuska 	mutex_enter(&vre->vre_lock);
4900e716630dSMartin Matuska 	pres->pres_reflowed = vre->vre_bytes_copied;
4901e716630dSMartin Matuska 	for (int i = 0; i < TXG_SIZE; i++)
4902e716630dSMartin Matuska 		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4903e716630dSMartin Matuska 	mutex_exit(&vre->vre_lock);
4904e716630dSMartin Matuska 
4905e716630dSMartin Matuska 	pres->pres_start_time = vre->vre_start_time;
4906e716630dSMartin Matuska 	pres->pres_end_time = vre->vre_end_time;
4907e716630dSMartin Matuska 	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4908e716630dSMartin Matuska 
4909e716630dSMartin Matuska 	return (0);
4910e716630dSMartin Matuska }
4911e716630dSMartin Matuska 
/*
 * Initialize private RAIDZ specific fields from the nvlist.
 *
 * On success, stores a newly allocated vdev_raidz_t in *tsd and returns 0.
 * Returns EINVAL if the config has no children array, or if the parity is
 * missing or not valid for the pool's SPA version.
 */
static int
vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	uint_t children;
	nvlist_t **child;
	int error = nvlist_lookup_nvlist_array(nv,
	    ZPOOL_CONFIG_CHILDREN, &child, &children);
	if (error != 0)
		return (SET_ERROR(EINVAL));

	uint64_t nparity;
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
			return (SET_ERROR(EINVAL));

		/*
		 * Previous versions could only support 1 or 2 parity
		 * device.
		 */
		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));
		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
			return (SET_ERROR(EINVAL));
	} else {
		/*
		 * We require the parity to be specified for SPAs that
		 * support multiple parity levels.
		 */
		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));

		/*
		 * Otherwise, we default to 1 parity device for RAID-Z.
		 */
		nparity = 1;
	}

	/*
	 * Allocate the private state.  The expansion fields start out with
	 * sentinel values (vdev id -1, offsets UINT64_MAX); the vdev id is
	 * overwritten below when the config supplies one.
	 */
	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
	vdrz->vn_vre.vre_vdev_id = -1;
	vdrz->vn_vre.vre_offset = UINT64_MAX;
	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));

	/* The physical width is the number of children in the config. */
	vdrz->vd_physical_width = children;
	vdrz->vd_nparity = nparity;

	/* note, the ID does not exist when creating a pool */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
	    &vdrz->vn_vre.vre_vdev_id);

	/*
	 * The presence of ZPOOL_CONFIG_RAIDZ_EXPANDING means an expansion of
	 * this vdev is underway: register it as the pool's active expansion
	 * and mark it as scanning.
	 */
	boolean_t reflow_in_progress =
	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
	if (reflow_in_progress) {
		spa->spa_raidz_expand = &vdrz->vn_vre;
		vdrz->vn_vre.vre_state = DSS_SCANNING;
	}

	/*
	 * Default: the vdev has never been expanded.  This is refined below
	 * if the config lists expansion txgs or a reflow is in progress.
	 */
	vdrz->vd_original_width = children;
	uint64_t *txgs;
	unsigned int txgs_size = 0;
	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
	    &txgs, &txgs_size);
	if (error == 0) {
		/*
		 * Walk the txg array from its last entry to its first.  The
		 * last txg is paired with the current physical width and
		 * each earlier txg with a width one narrower.  While a
		 * reflow is in progress every logical width is reduced by
		 * one more -- presumably because the child being added is
		 * not yet part of a completed expansion (NOTE(review):
		 * confirm).
		 */
		for (int i = 0; i < txgs_size; i++) {
			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
			re->re_txg = txgs[txgs_size - i - 1];
			re->re_logical_width = vdrz->vd_physical_width - i;

			if (reflow_in_progress)
				re->re_logical_width--;

			avl_add(&vdrz->vd_expand_txgs, re);
		}

		/* One child was added per recorded expansion txg. */
		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
	}
	if (reflow_in_progress) {
		/* The in-progress expansion accounts for one more child. */
		vdrz->vd_original_width--;
		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
		    children, txgs_size);
	}

	*tsd = vdrz;

	return (0);
}
50067877fdebSMatt Macy 
50077877fdebSMatt Macy static void
50087877fdebSMatt Macy vdev_raidz_fini(vdev_t *vd)
50097877fdebSMatt Macy {
5010e716630dSMartin Matuska 	vdev_raidz_t *vdrz = vd->vdev_tsd;
5011e716630dSMartin Matuska 	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
5012e716630dSMartin Matuska 		vd->vdev_spa->spa_raidz_expand = NULL;
5013e716630dSMartin Matuska 	reflow_node_t *re;
5014e716630dSMartin Matuska 	void *cookie = NULL;
5015e716630dSMartin Matuska 	avl_tree_t *tree = &vdrz->vd_expand_txgs;
5016e716630dSMartin Matuska 	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
5017e716630dSMartin Matuska 		kmem_free(re, sizeof (*re));
5018e716630dSMartin Matuska 	avl_destroy(&vdrz->vd_expand_txgs);
5019e716630dSMartin Matuska 	mutex_destroy(&vdrz->vd_expand_lock);
5020e716630dSMartin Matuska 	mutex_destroy(&vdrz->vn_vre.vre_lock);
5021e716630dSMartin Matuska 	cv_destroy(&vdrz->vn_vre.vre_cv);
5022e716630dSMartin Matuska 	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
5023e716630dSMartin Matuska 	kmem_free(vdrz, sizeof (*vdrz));
50247877fdebSMatt Macy }
50257877fdebSMatt Macy 
50267877fdebSMatt Macy /*
50277877fdebSMatt Macy  * Add RAIDZ specific fields to the config nvlist.
50287877fdebSMatt Macy  */
50297877fdebSMatt Macy static void
50307877fdebSMatt Macy vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
50317877fdebSMatt Macy {
50327877fdebSMatt Macy 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
50337877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
50347877fdebSMatt Macy 
50357877fdebSMatt Macy 	/*
50367877fdebSMatt Macy 	 * Make sure someone hasn't managed to sneak a fancy new vdev
50377877fdebSMatt Macy 	 * into a crufty old storage pool.
50387877fdebSMatt Macy 	 */
50397877fdebSMatt Macy 	ASSERT(vdrz->vd_nparity == 1 ||
50407877fdebSMatt Macy 	    (vdrz->vd_nparity <= 2 &&
50417877fdebSMatt Macy 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
50427877fdebSMatt Macy 	    (vdrz->vd_nparity <= 3 &&
50437877fdebSMatt Macy 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
50447877fdebSMatt Macy 
50457877fdebSMatt Macy 	/*
50467877fdebSMatt Macy 	 * Note that we'll add these even on storage pools where they
50477877fdebSMatt Macy 	 * aren't strictly required -- older software will just ignore
50487877fdebSMatt Macy 	 * it.
50497877fdebSMatt Macy 	 */
50507877fdebSMatt Macy 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
5051e716630dSMartin Matuska 
5052e716630dSMartin Matuska 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
5053e716630dSMartin Matuska 		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
5054e716630dSMartin Matuska 	}
5055e716630dSMartin Matuska 
5056e716630dSMartin Matuska 	mutex_enter(&vdrz->vd_expand_lock);
5057e716630dSMartin Matuska 	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
5058e716630dSMartin Matuska 		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
5059e716630dSMartin Matuska 		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
5060e716630dSMartin Matuska 		    KM_SLEEP);
5061e716630dSMartin Matuska 		uint64_t i = 0;
5062e716630dSMartin Matuska 
5063e716630dSMartin Matuska 		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
5064e716630dSMartin Matuska 		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
5065e716630dSMartin Matuska 			txgs[i++] = re->re_txg;
5066e716630dSMartin Matuska 		}
5067e716630dSMartin Matuska 
5068e716630dSMartin Matuska 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
5069e716630dSMartin Matuska 		    txgs, count);
5070e716630dSMartin Matuska 
5071e716630dSMartin Matuska 		kmem_free(txgs, sizeof (uint64_t) * count);
5072e716630dSMartin Matuska 	}
5073e716630dSMartin Matuska 	mutex_exit(&vdrz->vd_expand_lock);
50747877fdebSMatt Macy }
50757877fdebSMatt Macy 
50767877fdebSMatt Macy static uint64_t
50777877fdebSMatt Macy vdev_raidz_nparity(vdev_t *vd)
50787877fdebSMatt Macy {
50797877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
50807877fdebSMatt Macy 	return (vdrz->vd_nparity);
50817877fdebSMatt Macy }
50827877fdebSMatt Macy 
50837877fdebSMatt Macy static uint64_t
50847877fdebSMatt Macy vdev_raidz_ndisks(vdev_t *vd)
50857877fdebSMatt Macy {
50867877fdebSMatt Macy 	return (vd->vdev_children);
5087eda14cbcSMatt Macy }
5088eda14cbcSMatt Macy 
/*
 * Operations vector for RAIDZ vdevs.  Entries set to NULL (min_alloc, hold,
 * rele, remap, rebuild_asize, metaslab_init) have no RAIDZ-specific
 * implementation in this table.
 */
vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
5113e716630dSMartin Matuska 
/*
 * Tunables for RAIDZ expansion, registered as read-write (ZMOD_RW) module
 * parameters.  The string on each entry is its description.
 */
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
	"For testing, pause RAIDZ expansion after reflowing this many bytes");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
	"Max amount of concurrent i/o for RAIDZ expansion");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
	"For expanded RAIDZ, aggregate reads that have more rows than this");
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
	"For expanded RAIDZ, automatically start a pool scrub when expansion "
	"completes");
5123